In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

file_name = "data/output_tweets/sentiment_scored_english_tweets.csv"
print("Reading " + file_name + "\n");
data = pd.read_csv(file_name)
del data['Unnamed: 0']
print("Data shape : ")
print(data.shape)

print("Reading locations' mapping\n");
loc_to_canton = pd.read_csv('data/location_to_canton.csv')
del loc_to_canton['Unnamed: 0']
loc_to_canton.columns = [['canton','source_location']]
print(loc_to_canton.shape[0])
loc_to_canton.head()

Reading data/output_tweets/sentiment_scored_english_tweets.csv

Data shape : 
(263884, 7)
Reading locations' mapping

84


Unnamed: 0,canton,source_location
0,ZH,Affoltern am Albis
1,ZH,Zuerich
2,ZH,Zurigo
3,ZH,Uster
4,ZH,Winterthur


In [2]:
print("Percentage of unrepresented cantons in the data :");
not_represented_cantons = loc_to_canton[loc_to_canton.source_location.isnull()]
print(not_represented_cantons.shape[0]/loc_to_canton.shape[0])
not_represented_cantons

Percentage of unrepresented cantons in the data :
0.023809523809523808


Unnamed: 0,canton,source_location
32,GL,
33,ZG,


In [3]:
data.head()

Unnamed: 0,lang,main,published,sentiment,source_location,clean_main_polarity,vader_polarity
0,en,@ZHA_News https://youtu.be/UxQmxIm4q1s Rest i...,2016-04-01T02:35:56Z,POSITIVE,Lucerna,0.071428,0.8856
1,en,Come and join me at our testing Roadshow on Ap...,2016-04-01T14:42:21Z,POSITIVE,Ennetbürgen,0.071428,0.7644
2,en,Bad Apple! pic.twitter.com/Xad2aVOHFd,2016-04-01T21:50:50Z,NEGATIVE,Suica,-0.249994,-0.5848
3,en,Number crunching for the past week - 1 new unf...,2016-04-01T11:43:27Z,NEUTRAL,Arth,-0.090908,0.0772
4,en,Just posted a photo @ Langnau im Emmental http...,2016-04-01T06:31:13Z,NEUTRAL,Langnau,0.0,0.0


In [4]:
data_new = pd.merge(data, loc_to_canton, on='source_location', how='left')
data_new.head()

Unnamed: 0,lang,main,published,sentiment,source_location,clean_main_polarity,vader_polarity,canton
0,en,@ZHA_News https://youtu.be/UxQmxIm4q1s Rest i...,2016-04-01T02:35:56Z,POSITIVE,Lucerna,0.071428,0.8856,LU
1,en,Come and join me at our testing Roadshow on Ap...,2016-04-01T14:42:21Z,POSITIVE,Ennetbürgen,0.071428,0.7644,NW
2,en,Bad Apple! pic.twitter.com/Xad2aVOHFd,2016-04-01T21:50:50Z,NEGATIVE,Suica,-0.249994,-0.5848,
3,en,Number crunching for the past week - 1 new unf...,2016-04-01T11:43:27Z,NEUTRAL,Arth,-0.090908,0.0772,SZ
4,en,Just posted a photo @ Langnau im Emmental http...,2016-04-01T06:31:13Z,NEUTRAL,Langnau,0.0,0.0,BE


In [5]:
not_mapped_data = data_new[(data_new.canton).isnull()]
percentage_not_mapped = not_mapped_data.shape[0] / data_new.shape[0]
print("Size of unmapped data : ")
print(not_mapped_data.shape[0])
print("Percentage of unmapped data : ")
print(percentage_not_mapped)
not_mapped_data.head()

Size of unmapped data : 
171740
Percentage of unmapped data : 
0.6508162677540131


Unnamed: 0,lang,main,published,sentiment,source_location,clean_main_polarity,vader_polarity,canton
2,en,Bad Apple! pic.twitter.com/Xad2aVOHFd,2016-04-01T21:50:50Z,NEGATIVE,Suica,-0.249994,-0.5848,
16,en,@detravoir Might this confirm the prophecy? pi...,2016-04-01T17:49:36Z,NEUTRAL,Confoederatio Helvetica,0.0,0.0,
17,en,Bad Apple! pic.twitter.com/Xad2aVOHFd,2016-04-01T20:51:10Z,NEGATIVE,Suica,-0.249994,-0.5848,
18,en,Did some #GreenDay #acoustic. 21 Guns. #rock #...,2016-04-01T20:27:24Z,POSITIVE,Confoederatio Helvetica,0.0,0.0,
20,en,@BlizzHeroes @DustinBrowder http://imgur.com/y...,2016-04-01T00:23:23Z,NEGATIVE,Confoederatio Helvetica,-0.142855,-0.4939,


In [6]:
# drop unmapped data
mapped_data = data_new.dropna(subset= ['canton'], how='all')
# take only relevent features
data_sent_canton = mapped_data[['canton','sentiment','clean_main_polarity','vader_polarity','published']]
print("Final data size : ")
print(data_sent_canton.shape[0])
data_sent_canton.head()

Final data size : 
92144


Unnamed: 0,canton,sentiment,clean_main_polarity,vader_polarity,published
0,LU,POSITIVE,0.071428,0.8856,2016-04-01T02:35:56Z
1,NW,POSITIVE,0.071428,0.7644,2016-04-01T14:42:21Z
3,SZ,NEUTRAL,-0.090908,0.0772,2016-04-01T11:43:27Z
4,BE,NEUTRAL,0.0,0.0,2016-04-01T06:31:13Z
5,GR,NEUTRAL,-0.181817,0.0,2016-04-01T13:42:13Z


In [7]:
def sentiment_value(name):
    if name == 'NEUTRAL':
        val = 0
    elif name == 'POSITIVE':
        val = 1
    elif name == 'NEGATIVE':
        val = -1
    else :
        val = np.NaN
    return val

In [8]:
data_sent_canton['sentiment'] = [sentiment_value(a) for a in data_sent_canton['sentiment']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [9]:
data_sent_canton = data_sent_canton.dropna()

In [10]:
print(data_sent_canton.shape[0])
data_sent_canton.head()

92144


Unnamed: 0,canton,sentiment,clean_main_polarity,vader_polarity,published
0,LU,1,0.071428,0.8856,2016-04-01T02:35:56Z
1,NW,1,0.071428,0.7644,2016-04-01T14:42:21Z
3,SZ,0,-0.090908,0.0772,2016-04-01T11:43:27Z
4,BE,0,0.0,0.0,2016-04-01T06:31:13Z
5,GR,0,-0.181817,0.0,2016-04-01T13:42:13Z


In [11]:
data_sent_canton['published'] = [a.split('T')[0] for a in data_sent_canton['published']]

In [12]:
data_sent_canton=data_sent_canton.rename(columns = {'published':'day'})
data_sent_canton.head()

Unnamed: 0,canton,sentiment,clean_main_polarity,vader_polarity,day
0,LU,1,0.071428,0.8856,2016-04-01
1,NW,1,0.071428,0.7644,2016-04-01
3,SZ,0,-0.090908,0.0772,2016-04-01
4,BE,0,0.0,0.0,2016-04-01
5,GR,0,-0.181817,0.0,2016-04-01


In [13]:
data_day = data_sent_canton.groupby(['day'])
for name, group in data_day:
        print(name)
        #print(group)
data_day.describe()

2016-04-01
2016-04-02
2016-04-03
2016-04-04
2016-04-05
2016-04-06
2016-04-07
2016-04-08
2016-04-09
2016-04-10
2016-04-11
2016-04-12
2016-04-13
2016-04-14
2016-04-15
2016-04-16
2016-04-17
2016-04-18
2016-04-19
2016-04-20
2016-04-21
2016-04-22
2016-04-23
2016-04-24
2016-04-25
2016-04-26
2016-04-27
2016-04-28
2016-04-29
2016-04-30


Unnamed: 0_level_0,Unnamed: 1_level_0,clean_main_polarity,sentiment,vader_polarity
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-04-01,count,3484.000000,3484.000000,3484.000000
2016-04-01,mean,0.018253,0.185132,0.137028
2016-04-01,std,0.129140,0.698135,0.380699
2016-04-01,min,-0.999975,-1.000000,-0.953100
2016-04-01,25%,0.000000,0.000000,0.000000
2016-04-01,50%,0.000000,0.000000,0.000000
2016-04-01,75%,0.000000,1.000000,0.440400
2016-04-01,max,1.000000,1.000000,0.967000
2016-04-02,count,2401.000000,2401.000000,2401.000000
2016-04-02,mean,0.015171,0.197418,0.134885


In [14]:
# Statistical test : day 1
data_single_day = data_day.get_group('2016-04-01')
data_single_day_gp = data_single_day.groupby('canton')
# p_values
data_single_day_p_values = pd.DataFrame(columns = ['canton','sentiment_pv','vader_pv','main_pv'])
for i,group in enumerate( data_single_day_gp.groups):
    
    data_c = data_single_day_gp.get_group(group)
    pv_sentiment = stats.ttest_ind(a = data_c.sentiment, b= data_single_day.sentiment, equal_var=False)[1]
    pv_vader = (stats.ttest_ind(a = data_c.vader_polarity, b= data_single_day.vader_polarity, equal_var=False))[1]
    pv_main = (stats.ttest_ind(a = data_c.clean_main_polarity, b= data_single_day.clean_main_polarity, equal_var=False))[1]
    data_single_day_p_values.loc[i] = [group, pv_sentiment, pv_vader, pv_main]
data_single_day_p_values = data_single_day_p_values.set_index('canton')
# mean
data_single_day_mean = data_single_day_gp.mean()
data_single_day_mean.columns = ['main_mean','sentiment_mean','vader_mean']
# std
data_single_day_std = data_single_day_gp.std()
data_single_day_std.columns = ['main_std','sentiment_std','vader_std']
# count
data_single_day_count = pd.DataFrame(data_single_day_gp.count()['sentiment'])
data_single_day_count.columns = ['count']
# concatenation
data_single_day = pd.concat([data_single_day_mean, data_single_day_std, data_single_day_p_values, data_single_day_count], axis = 1, join = 'inner')
data_single_day = data_single_day.reset_index()
data_single_day



Unnamed: 0,canton,main_mean,sentiment_mean,vader_mean,main_std,sentiment_std,vader_std,sentiment_pv,vader_pv,main_pv,count
0,BE,0.018348,0.158416,0.122261,0.122501,0.634539,0.368261,0.563529,0.5807815,0.9914944,202
1,NE,0.017424,0.6,0.25529,0.036777,0.516398,0.281344,0.031782,0.2172487,0.9455813,10
2,SZ,-0.050505,-0.25,-0.0908,0.058898,0.5,0.235891,0.180085,0.1488004,0.1013397,4
3,GE,0.003031,0.069523,0.067172,0.11537,0.726309,0.392874,1e-06,6.796775e-08,0.0001160781,1237
4,AR,0.148861,0.25,0.490125,0.235173,0.957427,0.248201,0.900814,0.0651564,0.3477159,4
5,FR,0.025523,0.24,0.111044,0.096528,0.597216,0.35307,0.65161,0.717186,0.7114765,25
6,JU,0.0,0.0,0.0,,,,,,,1
7,SH,0.0,0.2,0.1009,0.0,0.447214,0.475233,0.944383,0.8733116,1.032093e-16,5
8,LU,0.031344,0.277778,0.282889,0.187061,0.741085,0.366652,0.460162,0.02310068,0.6778433,36
9,TG,-0.142855,0.0,0.0,,,,,,,1


In [15]:
def day_preprocess(day):
    data_single_day = data_day.get_group(day)
    data_single_day_gp = data_single_day.groupby('canton')
    # p_values
    data_single_day_p_values = pd.DataFrame(columns = ['canton','sentiment_pv','vader_pv','main_pv'])
    for i,group in enumerate( data_single_day_gp.groups):
        data_c = data_single_day_gp.get_group(group)
        pv_sentiment = stats.ttest_ind(a = data_c.sentiment, b= data_single_day.sentiment, equal_var=False)[1]
        pv_vader = (stats.ttest_ind(a = data_c.vader_polarity, b= data_single_day.vader_polarity, equal_var=False))[1]
        pv_main = (stats.ttest_ind(a = data_c.clean_main_polarity, b= data_single_day.clean_main_polarity, equal_var=False))[1]
        data_single_day_p_values.loc[i] = [group, pv_sentiment, pv_vader, pv_main]
    data_single_day_p_values = data_single_day_p_values.set_index('canton')
    # mean
    data_single_day_mean = data_single_day_gp.mean()
    data_single_day_mean.columns = ['main_mean','sentiment_mean','vader_mean']
    # std
    data_single_day_std = data_single_day_gp.std()
    data_single_day_std.columns = ['main_std','sentiment_std','vader_std']
    # count
    data_single_day_count = pd.DataFrame(data_single_day_gp.count()['sentiment'])
    data_single_day_count.columns = ['count']
    # concatenation
    data_single_day = pd.concat([data_single_day_mean, data_single_day_std, data_single_day_p_values, data_single_day_count], axis = 1, join = 'inner')
    data_single_day = data_single_day.reset_index()
    return data_single_day

In [16]:
prefix="viz-data/__harvest3r_twitter_data_"
postfix = "-04_0.json"
for i in np.arange(30):
    if (i<9):
        day = '2016-04-0' + str(i+1)
    else : 
        day = '2016-04-' + str(i+1)
    print("Pre-processing " + day + "\n");
    data_sd = day_preprocess(day)
    print(data_sd.head())
    print("write json file " + day + "\n");
    if (i<9):
        day = '0' + str(i+1)
    else : 
        day = str(i+1)
    file_name = prefix + day + postfix
    print(file_name)
    data_sd.to_json(file_name)

Pre-processing 2016-04-01

  canton  main_mean  sentiment_mean  vader_mean  main_std  sentiment_std  \
0     BE   0.018348        0.158416    0.122261  0.122501       0.634539   
1     NE   0.017424        0.600000    0.255290  0.036777       0.516398   
2     SZ  -0.050505       -0.250000   -0.090800  0.058898       0.500000   
3     GE   0.003031        0.069523    0.067172  0.115370       0.726309   
4     AR   0.148861        0.250000    0.490125  0.235173       0.957427   

   vader_std  sentiment_pv      vader_pv   main_pv  count  
0   0.368261      0.563529  5.807815e-01  0.991494    202  
1   0.281344      0.031782  2.172487e-01  0.945581     10  
2   0.235891      0.180085  1.488004e-01  0.101340      4  
3   0.392874      0.000001  6.796775e-08  0.000116   1237  
4   0.248201      0.900814  6.515640e-02  0.347716      4  
write json file 2016-04-01

viz-data/__harvest3r_twitter_data_01-04_0.json
Pre-processing 2016-04-02

  canton  main_mean  sentiment_mean  vader_mean  main_



  canton  main_mean  sentiment_mean  vader_mean  main_std  sentiment_std  \
0     BE   0.018687        0.182353    0.121654  0.117797       0.667713   
1     NE   0.019231        0.500000    0.412850  0.027196       0.707107   
2     VS   0.056251        0.400000    0.272680  0.109930       0.699206   
3     SO   0.034285        0.000000    0.265440  0.048022       1.000000   
4     GE   0.026009        0.179671    0.152202  0.120938       0.720823   

   vader_std  sentiment_pv  vader_pv   main_pv  count  
0   0.351822      0.341543  0.245072  0.999556    170  
1   0.583858      0.687790  0.643865  0.981863      2  
2   0.349670      0.470241  0.313288  0.308808     10  
3   0.430713      0.629928  0.595238  0.509385      5  
4   0.390350      0.049565  0.883147  0.111335    974  
write json file 2016-04-03

viz-data/__harvest3r_twitter_data_03-04_0.json
Pre-processing 2016-04-04

  canton  main_mean  sentiment_mean  vader_mean  main_std  sentiment_std  \
0     BE   0.017711        0.