In [25]:
import json
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from datasets.utils import split_data
from constants import SENTIMENT_RAW_DATA_DIR, SENTIMENT_DOMAINS, RANDOM_SEED

# Map each sentiment domain name to its integer id (its position in
# SENTIMENT_DOMAINS); later cells compare these ids against `domain_label`.
domain_id = {domain: idx for idx, domain in enumerate(SENTIMENT_DOMAINS)}

# Build the input path from the project constant rather than a hard-coded
# home-directory path, so the notebook runs on other machines and reads from
# the same "unified" directory the final cell writes to.
# NOTE(review): assumes SENTIMENT_RAW_DATA_DIR resolves to
# .../CausaLM/Data/Sentiment/Raw - confirm against constants.py.
unified_csv = f"{SENTIMENT_RAW_DATA_DIR}/unified/adj_all.csv"
cols = ['id', 'domain_label', 'review', 'review_len', 'sentiment_label']

# Only the columns listed in `cols` are loaded from the CSV.
reviews_df = pd.read_csv(unified_csv, header=0, encoding="utf-8", usecols=cols)
reviews_df.head()

Unnamed: 0,id,domain_label,review,review_len,sentiment_label
0,9,0,""" Tokyo Eyes "" tells of a 17 year old Japanese...",156,0
1,28,0,"Uhhh ... so , did they even have writers for t...",825,0
2,82,0,"How sad it is when a film as wonderful as "" Ju...",131,0
3,104,0,Worst pile of drivel to date ! Everyone involv...,47,0
4,127,0,I do n't even understand what they tried to ac...,195,0


In [26]:
# Sanity checks: distinct domain labels present in the data, number of
# reviews loaded, and the domain-name -> id mapping (one per output line).
print(reviews_df.domain_label.unique(), len(reviews_df), domain_id, sep="\n")

[0 1 2 3 4]
10000
{'movies': 0, 'books': 1, 'electronics': 2, 'kitchen': 3, 'dvd': 4}


In [27]:
def get_topic_distribution(data, num_topics, num_features, num_top_words):
    """Fit an LDA topic model on `data` and return per-document topic weights.

    Args:
        data: iterable of raw text documents.
        num_topics: number of LDA components (topics) to fit.
        num_features: maximum vocabulary size kept by the CountVectorizer.
        num_top_words: how many top words per topic are passed to
            display_topics() for printing/recording.

    Returns:
        The result of `lda.transform` on the term-count matrix: one row per
        document and one column per topic.

    Side effects:
        Calls display_topics(), which prints the top words of every topic and
        records them in the module-level `topic_words_dict`.
    """
    tf_vectorizer = CountVectorizer(max_features=num_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(data)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on old installations.
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        tf_feature_names = tf_vectorizer.get_feature_names_out()
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    # Fixed random_state keeps the fitted topics reproducible across runs.
    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=10, learning_method='online',
                                    learning_offset=50., random_state=RANDOM_SEED, n_jobs=1).fit(tf)

    display_topics(lda, tf_feature_names, num_top_words)
    return lda.transform(tf)

In [28]:
# Top words of each topic, keyed by 1-based topic number; filled in by
# display_topics().
topic_words_dict = {}
def display_topics(model, feature_names, no_top_words):
    """Print the top words of every topic and record them in `topic_words_dict`.

    Args:
        model: fitted topic model exposing `components_` (one weight vector
            per topic over the vocabulary).
        feature_names: vocabulary terms, indexable by column position.
        no_top_words: number of highest-weighted words to show per topic.

    Side effects:
        Prints a header and a space-separated word list per topic, and stores
        the same word list under the 1-based topic number in the module-level
        `topic_words_dict` (existing entries for that number are overwritten).
    """
    for topic_idx, topic in enumerate(model.components_):
        print("\n Topic {}:".format(topic_idx+1))
        # argsort() is ascending, so walk it backwards to get the
        # highest-weighted terms first; build the string once and reuse it.
        top_words = " ".join(feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1])
        print(top_words)
        topic_words_dict[topic_idx+1] = top_words

In [29]:
## LDA model hyperparameters:
##   num_topics    - number of topics to fit
##   num_features  - vocabulary size cap for the count vectorizer
##   num_top_words - words printed/recorded per topic
num_topics, num_features, num_top_words = 50, 500, 10

# Fit the topic model on all review texts; the result holds one row of topic
# weights per review.
domain_topic_dist = get_topic_distribution(
    data=reviews_df['review'].tolist(),
    num_topics=num_topics,
    num_features=num_features,
    num_top_words=num_top_words,
)


 Topic 1:
like coffee just food world people way look makes good

 Topic 2:
new nice time come size cooking need stay large little

 Topic 3:
quality poor good days major high performance pay work fit

 Topic 4:
10 hours model worst week try bought purchase seen enjoy

 Topic 5:
great machine space works price deal little perfect best really

 Topic 6:
kids nt service loved took worth said style saying wrong

 Topic 7:
special computer amazing cool effects features looking definitely value course

 Topic 8:
life man people world men women john human young god

 Topic 9:
version screen play design light perfect color dark beautiful order

 Topic 10:
evil fit power experience books huge entertaining gets say box

 Topic 11:
product speakers cover purchased huge use recommend awful good did

 Topic 12:
wonderful highly recommend received recommended stop non gives truly look

 Topic 13:
just ve did time like tried think thing really know

 Topic 14:
heat house screen version despite wife

In [30]:
# Wrap the per-document topic distribution in a DataFrame with one column per
# topic, named topic_1 ... topic_<num_topics>.
topics_df = pd.DataFrame(np.array(domain_topic_dist),
                         columns=['topic_' + str(i + 1) for i in range(num_topics)])

# Use the non-inplace reset_index() so `reviews_df` is left untouched and this
# cell can be re-run safely (the inplace form fails on a second run because an
# 'index' column already exists). The resulting frame still carries the
# 'index' column, exactly as before.
df = pd.concat([reviews_df.reset_index(), topics_df], axis=1)

# Binarise every topic: topic_bin_k is 1 where the review's weight for topic k
# exceeds that topic's mean weight over all reviews, else 0. The printed value
# is the fraction of reviews flagged 1 for each topic.
for i in range(num_topics):
    topic_col = 'topic_' + str(i + 1)
    bin_col = 'topic_bin_' + str(i + 1)
    df[bin_col] = (df[topic_col] > df[topic_col].mean()).astype(int)
    print(df[bin_col].mean())

0.1884
0.2133
0.1546
0.1615
0.2194
0.1348
0.1107
0.192
0.1763
0.3349
0.133
0.1583
0.2614
0.3349
0.1347
0.3349
0.2013
0.2044
0.1155
0.1143
0.2454
0.1782
0.148
0.1726
0.1455
0.1832
0.1555
0.3349
0.1212
0.1651
0.3349
0.099
0.1761
0.3349
0.1613
0.1185
0.2132
0.0665
0.1842
0.1865
0.177
0.129
0.1496
0.1906
0.1961
0.1517
0.1791
0.3349
0.3349
0.3349


In [31]:
# Preview the first rows of the combined frame: review columns, continuous
# topic weights (topic_k) and their binarised versions (topic_bin_k).
df.head()

Unnamed: 0,index,id,domain_label,review,review_len,sentiment_label,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29,topic_30,topic_31,topic_32,topic_33,topic_34,topic_35,topic_36,topic_37,topic_38,topic_39,topic_40,topic_41,topic_42,topic_43,topic_44,topic_45,topic_46,topic_47,topic_48,topic_49,topic_50,topic_bin_1,topic_bin_2,topic_bin_3,topic_bin_4,topic_bin_5,topic_bin_6,topic_bin_7,topic_bin_8,topic_bin_9,topic_bin_10,topic_bin_11,topic_bin_12,topic_bin_13,topic_bin_14,topic_bin_15,topic_bin_16,topic_bin_17,topic_bin_18,topic_bin_19,topic_bin_20,topic_bin_21,topic_bin_22,topic_bin_23,topic_bin_24,topic_bin_25,topic_bin_26,topic_bin_27,topic_bin_28,topic_bin_29,topic_bin_30,topic_bin_31,topic_bin_32,topic_bin_33,topic_bin_34,topic_bin_35,topic_bin_36,topic_bin_37,topic_bin_38,topic_bin_39,topic_bin_40,topic_bin_41,topic_bin_42,topic_bin_43,topic_bin_44,topic_bin_45,topic_bin_46,topic_bin_47,topic_bin_48,topic_bin_49,topic_bin_50
0,0,9,0,""" Tokyo Eyes "" tells of a 17 year old Japanese...",156,0,0.000769,0.043984,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.060822,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.039815,0.102106,0.039231,0.000769,0.000769,0.000769,0.000769,0.13295,0.000769,0.000769,0.123705,0.000769,0.000769,0.072567,0.051228,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.25697,0.000769,0.046621,0.000769,0.000769,0.000769,0.000769,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0
1,1,28,0,"Uhhh ... so , did they even have writers for t...",825,0,0.00015,0.088155,0.00015,0.00015,0.00015,0.00015,0.00015,0.014309,0.00015,0.00015,0.00015,0.00015,0.128773,0.00015,0.022692,0.00015,0.00015,0.00015,0.007663,0.00015,0.36555,0.075159,0.00015,0.00015,0.00015,0.00015,0.00015,0.00015,0.00015,0.00015,0.00015,0.00015,0.046504,0.00015,0.00015,0.00015,0.00015,0.00015,0.00015,0.00015,0.00015,0.00015,0.00015,0.245029,0.00015,0.00015,0.00015,0.00015,0.00015,0.00015,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,2,82,0,"How sad it is when a film as wonderful as "" Ju...",131,0,0.000833,0.000833,0.000833,0.000833,0.039427,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.211066,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.307593,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.046448,0.000833,0.000833,0.357966,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
3,3,104,0,Worst pile of drivel to date ! Everyone involv...,47,0,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.910909,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0.001818,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
4,4,127,0,I do n't even understand what they tried to ac...,195,0,0.000444,0.037215,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.144824,0.000444,0.022667,0.000444,0.000444,0.000444,0.000444,0.000444,0.77485,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0.000444,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [32]:
topic_cols = ['topic_' + str(i + 1) for i in range(num_topics)]

# Per domain: the chosen treated topic, three control topics and one
# "not control" topic (all stored as topic column names).
treated_control_topics_dict = {}

for domain in SENTIMENT_DOMAINS:
    in_domain = df['domain_label'] == domain_id[domain]
    # With no rows on either side the means below are all NaN and no ranking
    # is meaningful; fail with a clear message instead.
    if not in_domain.any() or in_domain.all():
        raise ValueError(f"Cannot contrast domain '{domain}': it needs rows both inside and outside the domain")

    topic_means_domain = df.loc[in_domain, topic_cols].mean()
    topic_means_other_domains = df.loc[~in_domain, topic_cols].mean()

    # Rank topics by how much more prevalent they are inside the domain than
    # outside it (largest difference first).
    topic_means = topic_means_domain.subtract(topic_means_other_domains)
    topic_means = topic_means.sort_values(ascending=False)
    ranked_topics = topic_means.index

    # The Series is already sorted, so positions 1..4 of the ranking are the
    # values the former idxmax-over-a-slice calls returned; this also avoids
    # passing axis=1 to Series.idxmax, which current pandas rejects.
    not_control_topic = ranked_topics[1]
    # Unpacking raises if fewer than five topics exist, as the original did.
    control_topic_1, control_topic_2, control_topic_3 = ranked_topics[2:5]
    # The treated topic is the one with the highest raw mean inside the domain.
    # NOTE(review): this uses the raw in-domain mean while the other picks use
    # the difference ranking with position 0 skipped; if the two disagree on
    # the top topic, the treated topic can also show up among the control /
    # not-control topics - confirm this is intended.
    treated_topic = topic_means_domain.idxmax()

    print(domain, domain_id[domain])
    # Topic columns are named topic_<k>; <k> is the key into topic_words_dict.
    for topic in (treated_topic, not_control_topic, control_topic_1, control_topic_2, control_topic_3):
        print(topic, topic_words_dict[int(topic.split("_")[1])])

    treated_control_topics_dict[domain] = {"treated_topic": treated_topic,
                                          "control_topics": [control_topic_1, control_topic_2, control_topic_3],
                                          "not_control_topic": not_control_topic}

movies 0
topic_21 movie like movies just funny comedy watch character really did
topic_44 film films story time director characters character scenes best scene
topic_22 good movie really pretty action liked better like story plot
topic_8 life man people world men women john human young god
topic_35 story children little home love characters fun family great value
books 1
topic_37 book read books reading author novel story like writing written
topic_8 life man people world men women john human young god
topic_35 story children little home love characters fun family great value
topic_1 like coffee just food world people way look makes good
topic_38 quot english language personal people history reading written material live
electronics 2
topic_13 just ve did time like tried think thing really know
topic_18 dvd buy bought cable price amazon store just good like
topic_29 sound music ipod brand headphones better control good head low
topic_11 product speakers cover purchased huge use recomme

In [33]:
# Show the per-domain treated / control / not-control topic selection.
treated_control_topics_dict

{'movies': {'treated_topic': 'topic_21',
  'control_topics': ['topic_22', 'topic_8', 'topic_35'],
  'not_control_topic': 'topic_44'},
 'books': {'treated_topic': 'topic_37',
  'control_topics': ['topic_35', 'topic_1', 'topic_38'],
  'not_control_topic': 'topic_8'},
 'electronics': {'treated_topic': 'topic_13',
  'control_topics': ['topic_29', 'topic_11', 'topic_47'],
  'not_control_topic': 'topic_18'},
 'kitchen': {'treated_topic': 'topic_13',
  'control_topics': ['topic_40', 'topic_1', 'topic_5'],
  'not_control_topic': 'topic_2'},
 'dvd': {'treated_topic': 'topic_21',
  'control_topics': ['topic_22', 'topic_18', 'topic_24'],
  'not_control_topic': 'topic_44'}}

In [34]:
# Show the top words recorded for each topic, keyed by 1-based topic number.
topic_words_dict

{1: 'like coffee just food world people way look makes good',
 2: 'new nice time come size cooking need stay large little',
 3: 'quality poor good days major high performance pay work fit',
 4: '10 hours model worst week try bought purchase seen enjoy',
 5: 'great machine space works price deal little perfect best really',
 6: 'kids nt service loved took worth said style saying wrong',
 7: 'special computer amazing cool effects features looking definitely value course',
 8: 'life man people world men women john human young god',
 9: 'version screen play design light perfect color dark beautiful order',
 10: 'evil fit power experience books huge entertaining gets say box',
 11: 'product speakers cover purchased huge use recommend awful good did',
 12: 'wonderful highly recommend received recommended stop non gives truly look',
 13: 'just ve did time like tried think thing really know',
 14: 'heat house screen version despite wife horror completely right things',
 15: 'half piece buying 

In [35]:
# Lift pandas' column-display limit so the whole correlation matrix renders.
pd.set_option("display.max_columns", None)
# Pairwise correlation between the per-review topic weights.
# NOTE(review): the recorded output shows several topics correlating at
# exactly 1.0 with each other (e.g. topic_10 with topic_14/16/28) - presumably
# topics that never receive weight beyond the prior; confirm.
df.loc[:, topic_cols].corr()

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29,topic_30,topic_31,topic_32,topic_33,topic_34,topic_35,topic_36,topic_37,topic_38,topic_39,topic_40,topic_41,topic_42,topic_43,topic_44,topic_45,topic_46,topic_47,topic_48,topic_49,topic_50
topic_1,1.0,-0.019917,-0.04293,-0.026991,-0.02975,-0.030385,-0.027497,-0.012261,-0.037748,-0.084662,-0.042411,-0.031942,-0.065881,-0.084662,-0.01822,-0.084662,-0.037598,-0.060524,-0.035197,0.012541,-0.117888,-0.077527,-0.042032,-0.033612,-0.022023,-0.029021,-0.037055,-0.084662,-0.031725,-0.034019,-0.084662,0.016375,-0.040795,-0.084662,-0.02787,-0.015234,-0.023873,0.000561,-0.062612,-0.032198,-0.023241,-0.0356,0.010139,-0.068311,-0.044277,-0.027819,-0.015621,-0.084662,-0.084662,-0.084662
topic_2,-0.019917,1.0,-0.009204,-0.013369,0.020607,-0.027686,-0.002186,-0.0558,0.021799,-0.001562,4.3e-05,0.008118,-0.049007,-0.001562,0.01903,-0.001562,-0.001646,-0.015903,-0.008563,0.048533,-0.147577,-0.079085,0.031105,-0.013154,-0.009783,0.045611,-0.009885,-0.001562,0.013399,0.029521,-0.001562,-0.024389,-0.02042,-0.001562,-0.041527,-0.016419,-0.067415,-0.02454,-0.022757,-0.010898,-0.018248,-0.02676,0.004651,-0.104318,-0.013195,-0.037744,0.065232,-0.001562,-0.001562,-0.001562
topic_3,-0.04293,-0.009204,1.0,0.004875,0.014798,-0.000881,-0.012822,-0.063147,0.027079,0.076315,0.053805,-0.010267,-0.034822,0.076315,0.020273,0.076315,0.000955,0.024091,0.023449,-0.013246,-0.102966,-0.030408,0.037394,-0.018523,0.005876,-0.018237,0.022366,0.076315,0.104264,0.026146,0.076315,-0.032009,0.049632,0.076315,-0.03632,-0.02108,-0.080803,-0.016518,0.005246,0.004416,0.012651,-0.022916,0.005661,-0.071496,-0.029592,0.003742,0.016503,0.076315,0.076315,0.076315
topic_4,-0.026991,-0.013369,0.004875,1.0,-0.014798,0.022342,-0.003958,-0.057613,-0.01005,0.073499,0.027076,-0.003307,0.006684,0.073499,0.03553,0.073499,-0.006643,-0.008104,0.049123,0.008995,-0.07563,-0.03271,0.045706,-0.002816,-0.014455,-0.00626,0.017684,0.073499,-0.001276,-0.001438,0.073499,-0.023839,-0.011511,0.073499,-0.050712,-0.009506,-0.062742,-0.023119,-0.013863,0.01198,0.034444,-0.012848,0.006894,-0.054793,-0.012162,0.007435,0.001005,0.073499,0.073499,0.073499
topic_5,-0.02975,0.020607,0.014798,-0.014798,1.0,-0.013988,0.008088,-0.087595,-0.014973,0.124839,0.061413,0.032648,-0.065605,0.124839,-0.021907,0.124839,-0.009858,0.009139,-0.002927,0.062106,-0.126895,-0.077607,0.026437,-0.012962,0.00642,0.037937,0.012214,0.124839,0.034244,0.025852,0.124839,-0.026378,-0.034475,0.124839,-0.041404,-0.030625,-0.07102,-0.020922,-0.012917,0.011261,-0.03009,-0.021663,0.031252,-0.100868,-0.035229,-0.021434,0.077904,0.124839,0.124839,0.124839
topic_6,-0.030385,-0.027686,-0.000881,0.022342,-0.013988,1.0,-0.008316,-0.040157,-0.014659,0.060874,0.006307,0.018871,0.004797,0.060874,0.0553,0.060874,0.012394,0.023069,0.024123,-0.006854,-0.034194,-0.020442,0.012871,0.021728,-0.008116,0.028698,0.008954,0.060874,-0.01229,0.00468,0.060874,-0.027641,-0.01924,0.060874,-0.005483,-0.006585,-0.029636,-0.016303,-0.00826,-0.008728,0.016067,-0.004985,0.014626,-0.071984,-0.017078,-0.005176,-0.025603,0.060874,0.060874,0.060874
topic_7,-0.027497,-0.002186,-0.012822,-0.003958,0.008088,-0.008316,1.0,-0.028172,-0.008596,0.031643,-0.005043,0.00715,-0.02723,0.031643,-0.008899,0.031643,-0.019073,-0.003825,-0.006001,-0.016116,0.01648,0.005637,0.008234,-0.002012,-0.007962,0.008635,-0.010206,0.031643,0.003916,-0.014118,0.031643,-0.008392,0.020034,0.031643,-0.018742,-0.022013,-0.044243,0.005354,-0.011441,-0.026895,-0.020176,0.003049,-0.01832,-0.006445,-0.005552,-0.01059,-0.001533,0.031643,0.031643,0.031643
topic_8,-0.012261,-0.0558,-0.063147,-0.057613,-0.087595,-0.040157,-0.028172,1.0,-0.041979,-0.116862,-0.086795,-0.042117,-0.08735,-0.116862,-0.053834,-0.116862,-0.049391,-0.107759,-0.059399,-0.056779,-0.021259,-0.035834,-0.090815,-0.016879,-0.024341,-0.043175,-0.029149,-0.116862,-0.072688,-0.056099,-0.116862,0.111733,-0.054788,-0.116862,0.049954,0.015414,0.075937,0.03059,-0.052575,-0.079729,-0.038093,0.035262,-0.06898,0.056182,-0.035816,-0.02276,-0.105735,-0.116862,-0.116862,-0.116862
topic_9,-0.037748,0.021799,0.027079,-0.01005,-0.014973,-0.014659,-0.008596,-0.041979,1.0,0.052702,0.001424,0.015651,-0.051512,0.052702,0.006266,0.052702,-0.00749,0.017654,-0.000352,0.009334,-0.066482,-0.022816,0.008699,-0.013601,-0.012994,0.000965,-0.008,0.052702,0.011939,0.005254,0.052702,-0.008671,-0.007608,0.052702,-0.024448,0.024467,-0.063568,-0.007269,-0.010952,-0.018828,-0.007935,-0.019354,-0.021563,-0.008646,-0.0172,-0.016013,-0.00702,0.052702,0.052702,0.052702
topic_10,-0.084662,-0.001562,0.076315,0.073499,0.124839,0.060874,0.031643,-0.116862,0.052702,1.0,0.11982,0.119168,-0.069851,1.0,0.071156,1.0,0.085033,0.051579,0.147125,0.033849,-0.190751,-0.056091,0.062484,0.03011,0.059457,0.062848,0.038721,1.0,0.030289,0.114267,1.0,0.009704,0.028523,1.0,-0.013446,0.013352,-0.020341,0.027803,0.078013,0.066321,0.004609,0.013347,0.067049,-0.173803,0.007086,-0.016978,0.085228,1.0,1.0,1.0


In [38]:
# Write the per-domain topic selection as JSON into the "unified" raw-data
# directory.
output_dir = f"{SENTIMENT_RAW_DATA_DIR}/unified"
topics_json_path = f"{output_dir}/domain_treat_control_topics.json"
with open(topics_json_path, "w") as handle:
    json.dump(treated_control_topics_dict, fp=handle)

# Hand the full feature frame to the project's split helper.
# NOTE(review): presumably writes "topics" dataset splits into output_dir
# using sentiment_label as the label column - confirm in datasets.utils.
split_data(df, output_dir, "topics", "sentiment_label")