In [118]:
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Review domains to load; each one has its own sub-directory under `adj_csvs`.
domains = ['books', 'dvd', 'electronics', 'kitchen', 'movies']
# NOTE(review): absolute, machine-specific path -- adjust to the local copy of the raw data.
adj_csvs = "/home/amirf/GoogleDrive/AmirNadav/CausaLM/Data/Sentiment/Raw/"
cols = ['id', 'domain_label', 'review', 'sentiment_label']
# Only the first N rows of every domain file are kept.
max_reviews_per_domain = 1000

# Maps domain name -> the numeric domain label found in that domain's file.
domain_id = {}
# Per-domain frames are collected here and concatenated once after the loop,
# instead of re-copying a growing frame on every iteration.
domain_frames = []

for domain in domains:
    cur_adj_df = pd.read_csv(adj_csvs + domain + "/adj_all.csv")
    cur_adj_df = cur_adj_df.iloc[:max_reviews_per_domain]
    domain_frames.append(cur_adj_df[cols])
    # The first label value seen in the file is used as the domain's id.
    domain_id[domain] = cur_adj_df['domain_label'].unique()[0]

# Row labels are left exactly as read from each file (no ignore_index), so the
# combined frame carries the same per-file index values as before.
reviews_df = pd.concat(domain_frames)

reviews_df.head()

Unnamed: 0,id,domain_label,review,sentiment_label
0,1,1,"Horrible book , horrible . THis book was horri...",0
1,2,1,shallow self - indulgence I like to use the Am...,0
2,3,1,"Horrible book , horrible . THis book was horri...",0
3,4,1,Disappointment I 'm not sure who 's writing th...,0
4,5,1,A Disappointing Mess I picked up the first boo...,0


In [119]:
# Sanity checks on the combined frame: the distinct domain labels present,
# the total number of reviews loaded, and the domain-name -> label mapping.
print(reviews_df['domain_label'].unique())
print(reviews_df.shape[0])
print(domain_id)

[1 4 2 3 0]
5000
{'books': 1, 'dvd': 4, 'electronics': 2, 'kitchen': 3, 'movies': 0}


In [120]:
def get_topic_distribution(data, num_topics, num_features, num_top_words):
    """Fit an LDA topic model on raw texts and return the per-document topic weights.

    Args:
        data: iterable of raw text documents, passed to ``CountVectorizer.fit_transform``.
        num_topics: number of LDA components to fit.
        num_features: vocabulary size cap (``max_features`` of the vectorizer).
        num_top_words: how many top words per topic ``display_topics`` prints.

    Returns:
        The result of ``lda.transform`` on the fitted term-count matrix
        (one row per document, one column per topic).

    Side effects:
        Calls ``display_topics``, which prints every topic's top words.
    """
    tf_vectorizer = CountVectorizer(max_features=num_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(data)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back only on old versions that do not provide it yet.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        tf_feature_names = tf_vectorizer.get_feature_names_out()
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    # random_state is fixed so the fitted topics are reproducible across runs.
    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=10, learning_method='online',
                                    learning_offset=50., random_state=42, n_jobs=1).fit(tf)

    display_topics(lda, tf_feature_names, num_top_words)
    return lda.transform(tf)

In [121]:
# Maps 1-based topic number -> space-separated top words; filled in by display_topics().
topic_words_dict = {}


def display_topics(model, feature_names, no_top_words):
    """Print the top words of every topic and record them in ``topic_words_dict``.

    Args:
        model: fitted object exposing ``components_`` (one weight row per topic).
        feature_names: sequence mapping a vocabulary column index to its word.
        no_top_words: number of highest-weighted words to report per topic.

    Side effects:
        Prints to stdout and overwrites ``topic_words_dict[topic_number]`` for each
        topic, where ``topic_number`` starts at 1.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so the reversed tail slice yields the largest weights first.
        top_indices = weights.argsort()[:-no_top_words - 1:-1]
        top_words = " ".join(feature_names[idx] for idx in top_indices)
        print("\n Topic {}:".format(topic_number))
        print(top_words)
        topic_words_dict[topic_number] = top_words

In [122]:
## LDA Model Hyperparams
num_topics = 50      # number of LDA components to fit
num_features = 500   # vocabulary size cap handed to the count vectorizer
num_top_words = 10   # top words printed (and stored) per topic

# Fit the topic model on every review text; the call prints each topic's top words.
domain_topic_dist = get_topic_distribution(
    data=reviews_df['review'].tolist(),
    num_topics=num_topics,
    num_features=num_features,
    num_top_words=num_top_words,
)


 Topic 1:
set box wait 20 making flat worth longer disappointment spend

 Topic 2:
pan phone computer work does use stick non battery problem

 Topic 3:
great use just months problem bought time good years ago

 Topic 4:
classic stories know great really book definitely reading writing long

 Topic 5:
item dull returned expect given fit did ridiculous run matter

 Topic 6:
kids crap turned fun children making point needs understand truly

 Topic 7:
dvd version video player original screen release quality features cover

 Topic 8:
book real second school author going page shows past light

 Topic 9:
machine software uses issues using help problems makes device bought

 Topic 10:
mind fine despite pages author lost books book read especially

 Topic 11:
book reading information read quot books better author know does

 Topic 12:
just did know ll does ve think got times work

 Topic 13:
product buy amazon bought plastic did reviews cheap work ve

 Topic 14:
does man just woman story wome

In [124]:
# One column per topic, named topic_1 .. topic_<num_topics>, holding the per-review weights.
topics_df = pd.DataFrame(np.array(domain_topic_dist),
                             columns=['topic_' + str(i + 1) for i in range(num_topics)])

# Move the per-file row labels into an 'index' column so both frames share a 0..n-1 index.
# Guarded so that re-running this cell does not stack a second ('level_0') index column
# or raise on a third run.
if 'index' not in reviews_df.columns:
    reviews_df = reviews_df.reset_index()

# A column-wise concat aligns on the index; mismatched lengths would silently yield NaNs.
assert len(reviews_df) == len(topics_df), "reviews and topic weights must have one row each per review"
df = pd.concat([reviews_df, topics_df], axis=1)


# Binarise every topic: 1 if the review's weight is above that topic's mean weight, else 0.
for i in range(num_topics):
    topic_col = 'topic_' + str(i + 1)
    bin_col = 'topic_bin_' + str(i + 1)
    topic_average = df[topic_col].mean()
    df[bin_col] = (df[topic_col] > topic_average).astype(int)
    # Share of reviews flagged as "above average" for this topic.
    print(df[bin_col].mean())

0.1312
0.1206
0.2126
0.1134
0.07
0.1036
0.1458
0.3334
0.0704
0.3334
0.1334
0.224
0.2064
0.1692
0.3334
0.1494
0.1102
0.1844
0.0968
0.1772
0.2162
0.334
0.0916
0.128
0.3334
0.3334
0.1706
0.0932
0.1474
0.1134
0.0578
0.3334
0.3334
0.3334
0.0668
0.1874
0.167
0.179
0.1622
0.1062
0.3334
0.3334
0.1038
0.0852
0.0856
0.2326
0.1802
0.1254
0.1944
0.093


In [125]:
# Preview the first rows of the combined frame (review columns, topic weights, binary topic flags).
df.head()

Unnamed: 0,index,id,domain_label,review,sentiment_label,topic_1,topic_2,topic_3,topic_4,topic_5,...,topic_bin_41,topic_bin_42,topic_bin_43,topic_bin_44,topic_bin_45,topic_bin_46,topic_bin_47,topic_bin_48,topic_bin_49,topic_bin_50
0,0,1,1,"Horrible book , horrible . THis book was horri...",0,0.000571,0.000571,0.000571,0.000571,0.000571,...,0,0,1,0,0,0,1,0,0,0
1,1,2,1,shallow self - indulgence I like to use the Am...,0,0.000833,0.000833,0.000833,0.000833,0.000833,...,0,0,1,0,0,0,0,0,0,0
2,2,3,1,"Horrible book , horrible . THis book was horri...",0,0.000571,0.000571,0.000571,0.000571,0.000571,...,0,0,1,0,0,0,1,0,0,0
3,3,4,1,Disappointment I 'm not sure who 's writing th...,0,0.000833,0.000833,0.000833,0.000833,0.000833,...,0,0,0,0,0,1,0,0,0,0
4,4,5,1,A Disappointing Mess I picked up the first boo...,0,0.000208,0.000208,0.000208,0.000208,0.000208,...,0,0,1,1,0,1,0,0,0,0


In [131]:
topic_cols = ['topic_' + str(i + 1) for i in range(num_topics)]

# Maps domain name -> {"treated_topic", "control_topics", "not_control_topic"}.
treated_control_topics_dict = {}

for domain in domains:
    # Split the reviews into this domain vs. all other domains.
    in_domain = df['domain_label'] == domain_id[domain]
    topic_means_domain = df.loc[in_domain, topic_cols].mean()
    topic_means_other_domains = df.loc[~in_domain, topic_cols].mean()

    # Rank topics by how much more weight they get inside the domain than outside it.
    topic_means = topic_means_domain.subtract(topic_means_other_domains)
    topic_means = topic_means.sort_values(ascending=False)

    # The Series is sorted descending, so positions 1..4 are the 2nd..5th ranked topics.
    not_control_topic = topic_means.index[1]
    control_topic_1 = topic_means.index[2]
    control_topic_2 = topic_means.index[3]
    control_topic_3 = topic_means.index[4]
    # The treated topic is picked by raw in-domain mean weight, not by the difference above.
    # NOTE(review): because of that it can coincide with one of the ranked topics -- confirm intended.
    treated_topic = topic_means_domain.idxmax()

    print(domain, domain_id[domain])
    for topic in (treated_topic, not_control_topic, control_topic_1, control_topic_2, control_topic_3):
        # Column names look like 'topic_<n>'; <n> is the key used in topic_words_dict.
        print(topic, topic_words_dict[int(topic.split("_")[1])])

    treated_control_topics_dict[domain] = {"treated_topic": treated_topic,
                                          "control_topics": [control_topic_1, control_topic_2, control_topic_3],
                                          "not_control_topic": not_control_topic}

books 1
topic_29 book read books author pages novel writing reader history interesting
topic_11 book reading information read quot books better author know does
topic_46 like just really did characters story character love little make
topic_18 people god says mr life like world person american way
topic_38 boring long time end story rest stop slow minutes good
dvd 4
topic_22 movie film like movies acting bad watch just plot scenes
topic_7 dvd version video player original screen release quality features cover
topic_46 like just really did characters story character love little make
topic_14 does man just woman story women way stop time like
topic_38 boring long time end story rest stop slow minutes good
electronics 2
topic_3 great use just months problem bought time good years ago
topic_13 product buy amazon bought plastic did reviews cheap work ve
topic_49 day black sound hours like just minutes bread went getting
topic_12 just did know ll does ve think got times work
topic_2 pan phon

In [None]:
# Show the per-domain treated / control / not-control topic selection.
treated_control_topics_dict

In [108]:
# Show the topic-number -> top-words mapping filled in by display_topics().
# NOTE(review): the saved output of this cell has 0-based keys, while display_topics()
# stores keys starting at 1 -- the output looks stale; re-run after fitting the model.
topic_words_dict

{0: 'friends simply hand sex number problem seriously does needs takes',
 1: 'music tv dvd watching shows shot opening television watched released',
 2: 'characters character story plot scenes main really interesting scene just',
 3: 'days experience song happy easy fast especially amazing simple eyes',
 4: 'old girl year boy star relationship age man friend horrible',
 5: 'watch want review hope rating reason better new people life',
 6: 'humor moments supposed extremely yes stop somewhat says making self',
 7: 'work budget camera low sound minutes use set does just',
 8: 'excellent play mr art perfect work usual beautiful enjoy able',
 9: 'men women girls middle female non miss sex beautiful group',
 10: 'amazing half god chance crap decent totally act sort gets',
 11: 'fan recommend highly episodes disappointed loved fans saw really thought',
 12: 'did time know just way came yes like people end',
 13: 'history strong important documentary major events earth enjoyed certainly world'

In [130]:
# Lift the column-display limit so every topic column is shown (this is a global pandas option).
pd.set_option('display.max_columns', None)  
# Pairwise correlation between the per-review topic weights.
# NOTE(review): the saved output shows several topic pairs with correlation 1.0
# (e.g. topic_8 and topic_10) -- worth checking whether those topics are degenerate.
df[topic_cols].corr()

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29,topic_30,topic_31,topic_32,topic_33,topic_34,topic_35,topic_36,topic_37,topic_38,topic_39,topic_40,topic_41,topic_42,topic_43,topic_44,topic_45,topic_46,topic_47,topic_48,topic_49,topic_50
topic_1,1.0,0.015631,0.028626,-0.006209,-0.007352,0.001805,-0.003393,0.076654,0.00283,0.076654,-0.016388,-0.016294,0.019489,-0.033376,0.076654,0.006582,0.007022,-0.037352,0.013394,-0.023722,-0.037856,-0.101799,0.03816,0.030543,0.076654,0.076654,-0.001043,-0.008021,-0.041768,0.010548,0.004785,0.076654,0.076654,0.076654,0.017454,-0.006757,0.001781,0.013268,-0.009512,-0.000684,0.076654,0.076654,-0.016871,0.003424,0.002136,-0.054578,0.006989,0.032005,-0.008496,0.006994
topic_2,0.015631,1.0,0.021208,-0.025064,0.014511,-0.007056,-0.041055,0.108241,0.033968,0.108241,-0.05759,-0.008422,0.012865,-0.072084,0.108241,0.034738,0.019292,-0.047581,0.033981,0.026898,-0.048815,-0.155684,-0.036734,-0.010035,0.108241,0.108241,0.056607,0.033614,-0.066181,0.004592,0.003773,0.108241,0.108241,0.108241,0.034301,0.011721,0.001067,-0.056879,-0.002839,0.019088,0.108241,0.108241,-0.027581,-0.020932,0.005325,-0.080121,0.028389,-0.001429,0.008198,0.043817
topic_3,0.028626,0.021208,1.0,-0.049217,0.01247,-0.032923,-0.041131,0.019297,0.052691,0.019297,-0.066004,-0.038913,-0.010929,-0.12183,0.019297,-0.038763,0.087639,-0.089063,0.032806,-0.031261,-0.061451,-0.234123,-0.048698,-0.003144,0.019297,0.019297,0.018996,-0.018079,-0.08614,-0.013395,0.076233,0.019297,0.019297,0.019297,0.045504,-0.010457,0.009328,-0.07141,-0.032683,-0.009993,0.019297,0.019297,-0.04913,-0.030914,0.000201,-0.142923,-0.039138,-0.024248,0.00209,0.023033
topic_4,-0.006209,-0.025064,-0.049217,1.0,-0.001797,0.012701,0.003486,0.070987,-0.018175,0.070987,0.039167,-0.039562,-0.015822,-0.000303,0.070987,-0.023648,-0.013166,-0.009734,0.008884,0.00215,-0.007899,-0.003591,0.041712,-0.015697,0.070987,0.070987,-0.025113,-0.001137,0.041104,-0.016521,-0.020433,0.070987,0.070987,0.070987,-0.024171,-0.019474,-0.007562,0.002101,-0.000332,-0.002249,0.070987,0.070987,0.007633,-0.007576,-0.015498,0.071827,-0.006483,0.013466,-0.038841,-0.017977
topic_5,-0.007352,0.014511,0.01247,-0.001797,1.0,0.001678,-0.018017,0.181906,0.016439,0.181906,-0.028008,-0.018028,0.042898,-0.052816,0.181906,-0.016613,0.047343,-0.047384,0.056615,-0.020024,-0.036383,-0.089209,-0.012637,0.113488,0.181906,0.181906,0.008161,0.033581,-0.047639,0.007522,0.004698,0.181906,0.181906,0.181906,0.020375,0.006339,0.038091,-0.009762,-0.010679,0.043324,0.181906,0.181906,0.016517,0.005491,0.005829,-0.049234,-0.006443,0.00409,-0.030528,0.022925
topic_6,0.001805,-0.007056,-0.032923,0.012701,0.001678,1.0,0.001878,0.022664,-0.017324,0.022664,0.009984,-0.003673,-0.033337,0.004266,0.022664,-0.009978,-0.033295,-0.002838,-0.022948,-0.00932,0.004525,-0.00489,-0.016725,-0.026775,0.022664,0.022664,0.000256,-0.029345,-0.018035,-0.010899,-0.024064,0.022664,0.022664,0.022664,-0.015715,-0.015479,-0.028385,0.027872,0.003352,0.003479,0.022664,0.022664,-0.00558,0.012011,0.000186,0.006165,-0.021126,-0.014338,-0.003822,-0.042373
topic_7,-0.003393,-0.041055,-0.041131,0.003486,-0.018017,0.001878,1.0,0.00973,-0.022472,0.00973,-0.053583,-0.037476,-0.035139,-0.042005,0.00973,-0.03944,-0.042365,-0.02714,-0.020249,-0.020967,-0.002732,-0.024991,0.019239,-0.010506,0.00973,0.00973,-0.024478,-0.0302,-0.066682,0.001787,-0.040693,0.00973,0.00973,0.00973,-0.017412,-0.058404,0.017299,-0.018622,-0.01042,-0.013953,0.00973,0.00973,-0.020205,0.001378,0.030674,-0.04189,-0.009993,0.005939,-0.044896,-0.013085
topic_8,0.076654,0.108241,0.019297,0.070987,0.181906,0.022664,0.00973,1.0,0.045323,1.0,0.042915,-0.083607,0.123913,-0.128724,1.0,0.065639,0.072172,-0.102426,0.115929,-0.015979,0.025119,-0.243487,0.031152,0.105614,1.0,1.0,0.085734,0.058944,-0.045945,0.044392,0.033617,1.0,1.0,1.0,0.068,0.079905,0.0921,0.047363,0.051497,0.093967,1.0,1.0,0.012355,0.084734,0.060834,-0.097571,0.068168,0.153494,-0.063946,0.022924
topic_9,0.00283,0.033968,0.052691,-0.018175,0.016439,-0.017324,-0.022472,0.045323,1.0,0.045323,-0.028669,0.001087,0.054953,-0.060207,0.045323,0.006242,0.047289,-0.037412,0.024749,-0.001903,0.001904,-0.122574,-0.030918,0.012752,0.045323,0.045323,0.010551,-0.00221,-0.036851,0.003129,0.065335,0.045323,0.045323,0.045323,0.0632,0.000959,0.018783,-0.027969,-0.018055,-0.000548,0.045323,0.045323,-0.019592,0.000967,0.001258,-0.063095,-0.00269,-0.004353,0.029464,0.046534
topic_10,0.076654,0.108241,0.019297,0.070987,0.181906,0.022664,0.00973,1.0,0.045323,1.0,0.042915,-0.083607,0.123913,-0.128724,1.0,0.065639,0.072172,-0.102426,0.115929,-0.015979,0.025119,-0.243487,0.031152,0.105614,1.0,1.0,0.085734,0.058944,-0.045945,0.044392,0.033617,1.0,1.0,1.0,0.068,0.079905,0.0921,0.047363,0.051497,0.093967,1.0,1.0,0.012355,0.084734,0.060834,-0.097571,0.068168,0.153494,-0.063946,0.022924
