In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats

file_name = "data/output_tweets/sentiment_scored_english_tweets.csv"
print("Reading " + file_name + "\n");
data = pd.read_csv(file_name)
del data['Unnamed: 0']
print("Data shape : ")
print(data.shape)

print("Reading locations' mapping\n");
loc_to_canton = pd.read_csv('data/location_to_canton.csv')
del loc_to_canton['Unnamed: 0']
loc_to_canton.columns = [['canton','source_location']]
print(loc_to_canton.shape[0])
loc_to_canton.head()

Reading data/output_tweets/sentiment_scored_english_tweets.csv

Data shape : 
(85405, 9)
Reading locations' mapping

84


Unnamed: 0,canton,source_location
0,ZH,Affoltern am Albis
1,ZH,Zuerich
2,ZH,Zurigo
3,ZH,Uster
4,ZH,Winterthur


In [3]:
print("Percentage of unrepresented cantons in the data :");
not_represented_cantons = loc_to_canton[loc_to_canton.source_location.isnull()]
print(not_represented_cantons.shape[0]/loc_to_canton.shape[0])
not_represented_cantons

Percentage of unrepresented cantons in the data :
0.023809523809523808


Unnamed: 0,canton,source_location
32,GL,
33,ZG,


In [4]:
data.head()

Unnamed: 0,lang,main,published,sentiment,source_location,main_tk,meaningful_main,clean_main_polarity,vader_polarity
0,en,@ZHA_News https://youtu.be/UxQmxIm4q1s Rest i...,2016-04-01T02:35:56Z,POSITIVE,Lucerna,"['https://youtu.be/UxQmxIm4q1s', 'Rest', 'in',...","['https://youtu.be/uxqmxim4q1s', 'rest', 'peac...",0.071428,0.8856
1,en,Come and join me at our testing Roadshow on Ap...,2016-04-01T14:42:21Z,POSITIVE,Ennetbürgen,"['Come', 'and', 'join', 'me', 'at', 'our', 'te...","['com', 'join', 'test', 'roadshow', 'april', '...",0.071428,0.7644
2,en,Bad Apple! pic.twitter.com/Xad2aVOHFd,2016-04-01T21:50:50Z,NEGATIVE,Suica,"['Bad', 'Apple', '!', 'pic.twitter.com/Xad2aVO...","['bad', 'appl', '!', 'pic.twitter.com/xad2avoh...",-0.249994,-0.5848
3,en,Number crunching for the past week - 1 new unf...,2016-04-01T11:43:27Z,NEUTRAL,Arth,"['Number', 'crunching', 'for', 'the', 'past', ...","['numb', 'crunch', 'past', 'week', '-', '1', '...",-0.090908,0.0772
4,en,Just posted a photo @ Langnau im Emmental http...,2016-04-01T06:31:13Z,NEUTRAL,Langnau,"['Just', 'posted', 'a', 'photo', '@', 'Langnau...","['just', 'post', 'photo', '@', 'langnau', 'im'...",0.0,0.0


In [5]:
data_new = pd.merge(data, loc_to_canton, on='source_location', how='left')
data_new.head()

Unnamed: 0,lang,main,published,sentiment,source_location,main_tk,meaningful_main,clean_main_polarity,vader_polarity,canton
0,en,@ZHA_News https://youtu.be/UxQmxIm4q1s Rest i...,2016-04-01T02:35:56Z,POSITIVE,Lucerna,"['https://youtu.be/UxQmxIm4q1s', 'Rest', 'in',...","['https://youtu.be/uxqmxim4q1s', 'rest', 'peac...",0.071428,0.8856,LU
1,en,Come and join me at our testing Roadshow on Ap...,2016-04-01T14:42:21Z,POSITIVE,Ennetbürgen,"['Come', 'and', 'join', 'me', 'at', 'our', 'te...","['com', 'join', 'test', 'roadshow', 'april', '...",0.071428,0.7644,NW
2,en,Bad Apple! pic.twitter.com/Xad2aVOHFd,2016-04-01T21:50:50Z,NEGATIVE,Suica,"['Bad', 'Apple', '!', 'pic.twitter.com/Xad2aVO...","['bad', 'appl', '!', 'pic.twitter.com/xad2avoh...",-0.249994,-0.5848,
3,en,Number crunching for the past week - 1 new unf...,2016-04-01T11:43:27Z,NEUTRAL,Arth,"['Number', 'crunching', 'for', 'the', 'past', ...","['numb', 'crunch', 'past', 'week', '-', '1', '...",-0.090908,0.0772,SZ
4,en,Just posted a photo @ Langnau im Emmental http...,2016-04-01T06:31:13Z,NEUTRAL,Langnau,"['Just', 'posted', 'a', 'photo', '@', 'Langnau...","['just', 'post', 'photo', '@', 'langnau', 'im'...",0.0,0.0,BE


In [7]:
not_mapped_data = data_new[(data_new.canton).isnull()]
percentage_not_mapped = not_mapped_data.shape[0] / data_new.shape[0]
print("Size of unmapped data : ")
print(not_mapped_data.shape[0])
print("Percentage of unmapped data : ")
print(percentage_not_mapped)
not_mapped_data.head()

Size of unmapped data : 
56476
Percentage of unmapped data : 
0.6612727592061355


Unnamed: 0,lang,main,published,sentiment,source_location,main_tk,meaningful_main,clean_main_polarity,vader_polarity,canton
2,en,Bad Apple! pic.twitter.com/Xad2aVOHFd,2016-04-01T21:50:50Z,NEGATIVE,Suica,"['Bad', 'Apple', '!', 'pic.twitter.com/Xad2aVO...","['bad', 'appl', '!', 'pic.twitter.com/xad2avoh...",-0.249994,-0.5848,
16,en,@detravoir Might this confirm the prophecy? pi...,2016-04-01T17:49:36Z,NEUTRAL,Confoederatio Helvetica,"['Might', 'this', 'confirm', 'the', 'prophecy'...","['might', 'confirm', 'prophecy', '?', 'pic.twi...",0.0,0.0,
17,en,Bad Apple! pic.twitter.com/Xad2aVOHFd,2016-04-01T20:51:10Z,NEGATIVE,Suica,"['Bad', 'Apple', '!', 'pic.twitter.com/Xad2aVO...","['bad', 'appl', '!', 'pic.twitter.com/xad2avoh...",-0.249994,-0.5848,
18,en,Did some #GreenDay #acoustic. 21 Guns. #rock #...,2016-04-01T20:27:24Z,POSITIVE,Confoederatio Helvetica,"['Did', 'some', '#GreenDay', '#acoustic', '.',...","['did', '#greenday', '#acoustic', '21', 'gun',...",0.0,0.0,
20,en,@BlizzHeroes @DustinBrowder http://imgur.com/y...,2016-04-01T00:23:23Z,NEGATIVE,Confoederatio Helvetica,"['http://imgur.com/yX89HVR', '...', 'why', '?'...","['http://imgur.com/yx89hvr', '...', '?', 'yo',...",-0.142855,-0.4939,


In [8]:
# drop unmapped data
mapped_data = data_new.dropna(subset= ['canton'], how='all')
# take only relevent features
data_sent_canton = mapped_data[['canton','sentiment','clean_main_polarity','vader_polarity','published']]
print("Final data size : ")
print(data_sent_canton.shape[0])
data_sent_canton.head()

Final data size : 
28929


Unnamed: 0,canton,sentiment,clean_main_polarity,vader_polarity,published
0,LU,POSITIVE,0.071428,0.8856,2016-04-01T02:35:56Z
1,NW,POSITIVE,0.071428,0.7644,2016-04-01T14:42:21Z
3,SZ,NEUTRAL,-0.090908,0.0772,2016-04-01T11:43:27Z
4,BE,NEUTRAL,0.0,0.0,2016-04-01T06:31:13Z
5,GR,NEUTRAL,-0.181817,0.0,2016-04-01T13:42:13Z


In [9]:
def sentiment_value(name):
    if name == 'NEUTRAL':
        val = 0
    elif name == 'POSITIVE':
        val = 1
    elif name == 'NEGATIVE':
        val = -1
    else :
        val = np.NaN
    return val

In [10]:
data_sent_canton['sentiment'] = [sentiment_value(a) for a in data_sent_canton['sentiment']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [14]:
data_sent_canton = data_sent_canton.dropna()

In [15]:
print(data_sent_canton.shape[0])
data_sent_canton.head()

28929


Unnamed: 0,canton,sentiment,clean_main_polarity,vader_polarity,published
0,LU,1,0.071428,0.8856,2016-04-01T02:35:56Z
1,NW,1,0.071428,0.7644,2016-04-01T14:42:21Z
3,SZ,0,-0.090908,0.0772,2016-04-01T11:43:27Z
4,BE,0,0.0,0.0,2016-04-01T06:31:13Z
5,GR,0,-0.181817,0.0,2016-04-01T13:42:13Z


In [16]:
data_sent_canton['published'] = [a.split('T')[0] for a in data_sent_canton['published']]

In [17]:
data_sent_canton=data_sent_canton.rename(columns = {'published':'day'})
data_sent_canton.head()

Unnamed: 0,canton,sentiment,clean_main_polarity,vader_polarity,day
0,LU,1,0.071428,0.8856,2016-04-01
1,NW,1,0.071428,0.7644,2016-04-01
3,SZ,0,-0.090908,0.0772,2016-04-01
4,BE,0,0.0,0.0,2016-04-01
5,GR,0,-0.181817,0.0,2016-04-01


In [37]:
data_day = data_sent_canton.groupby(['day'])
for name, group in data_day:
        print(name)
        #print(group)
data_day.describe()

2016-04-01
2016-04-02
2016-04-03
2016-04-04
2016-04-05
2016-04-06
2016-04-07
2016-04-08
2016-04-09
2016-04-10


Unnamed: 0_level_0,Unnamed: 1_level_0,clean_main_polarity,sentiment,vader_polarity
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-04-01,count,3484.000000,3484.000000,3484.000000
2016-04-01,mean,0.008248,0.185132,0.137028
2016-04-01,std,0.097688,0.698135,0.380699
2016-04-01,min,-0.999900,-1.000000,-0.953100
2016-04-01,25%,0.000000,0.000000,0.000000
2016-04-01,50%,0.000000,0.000000,0.000000
2016-04-01,75%,0.000000,1.000000,0.440400
2016-04-01,max,0.999950,1.000000,0.967000
2016-04-02,count,2401.000000,2401.000000,2401.000000
2016-04-02,mean,0.005107,0.197418,0.134885


In [63]:
# Statistical test : day 1
data_single_day = data_day.get_group('2016-04-01')
data_single_day_gp = data_single_day.groupby('canton')
# p_values
data_single_day_p_values = pd.DataFrame(columns = ['canton','sentiment_pv','vader_pv','main_pv'])
for i,group in enumerate( data_single_day_gp.groups):
    
    data_c = data_single_day_gp.get_group(group)
    pv_sentiment = stats.ttest_ind(a = data_c.sentiment, b= data_single_day.sentiment, equal_var=False)[1]
    pv_vader = (stats.ttest_ind(a = data_c.vader_polarity, b= data_single_day.vader_polarity, equal_var=False))[1]
    pv_main = (stats.ttest_ind(a = data_c.clean_main_polarity, b= data_single_day.clean_main_polarity, equal_var=False))[1]
    data_single_day_p_values.loc[i] = [group, pv_sentiment, pv_vader, pv_main]
data_single_day_p_values = data_single_day_p_values.set_index('canton')
# mean
data_single_day_mean = data_single_day_gp.mean()
data_single_day_mean.columns = ['main_mean','sentiment_mean','vader_mean']
# std
data_single_day_std = data_single_day_gp.std()
data_single_day_std.columns = ['main_std','sentiment_std','vader_std']
# count
data_single_day_count = pd.DataFrame(data_single_day_gp.count()['sentiment'])
data_single_day_count.columns = ['count']
# concatenation
data_single_day = pd.concat([data_single_day_mean, data_single_day_std, data_single_day_p_values, data_single_day_count], axis = 1, join = 'inner')
data_single_day = data_single_day.reset_index()
data_single_day.head()



Unnamed: 0,canton,main_mean,sentiment_mean,vader_mean,main_std,sentiment_std,vader_std,sentiment_pv,vader_pv,main_pv,count
0,BE,0.010975,0.158416,0.122261,0.113088,0.634539,0.368261,0.563529,0.5807815,0.737591,202
1,NW,0.071428,1.0,0.7644,,,,,,,1
2,FR,0.025523,0.24,0.111044,0.096528,0.597216,0.35307,0.65161,0.717186,0.381363,25
3,VD,-0.001714,0.164062,0.124496,0.081044,0.696191,0.397402,0.737204,0.726196,0.177596,128
4,GE,-0.001956,0.069523,0.067172,0.097995,0.726309,0.392874,1e-06,6.796775e-08,0.001662,1237


In [64]:
def day_preprocess(day):
    data_single_day = data_day.get_group(day)
    data_single_day_gp = data_single_day.groupby('canton')
    # p_values
    data_single_day_p_values = pd.DataFrame(columns = ['canton','sentiment_pv','vader_pv','main_pv'])
    for i,group in enumerate( data_single_day_gp.groups):
        data_c = data_single_day_gp.get_group(group)
        pv_sentiment = stats.ttest_ind(a = data_c.sentiment, b= data_single_day.sentiment, equal_var=False)[1]
        pv_vader = (stats.ttest_ind(a = data_c.vader_polarity, b= data_single_day.vader_polarity, equal_var=False))[1]
        pv_main = (stats.ttest_ind(a = data_c.clean_main_polarity, b= data_single_day.clean_main_polarity, equal_var=False))[1]
        data_single_day_p_values.loc[i] = [group, pv_sentiment, pv_vader, pv_main]
    data_single_day_p_values = data_single_day_p_values.set_index('canton')
    # mean
    data_single_day_mean = data_single_day_gp.mean()
    data_single_day_mean.columns = ['main_mean','sentiment_mean','vader_mean']
    # std
    data_single_day_std = data_single_day_gp.std()
    data_single_day_std.columns = ['main_std','sentiment_std','vader_std']
    # count
    data_single_day_count = pd.DataFrame(data_single_day_gp.count()['sentiment'])
    data_single_day_count.columns = ['count']
    # concatenation
    data_single_day = pd.concat([data_single_day_mean, data_single_day_std, data_single_day_p_values, data_single_day_count], axis = 1, join = 'inner')
    data_single_day = data_single_day.reset_index()
    return data_single_day

In [68]:
prefix="viz-data/__harvest3r_twitter_data_"
postfix = "-04_0.json"
for i in np.arange(10):
    if (i<9):
        day = '2016-04-0' + str(i+1)
    else : 
        day = '2016-04-' + str(i+1)
    print("Pre-processing " + day + "\n");
    data_sd = day_preprocess(day)
    print(data_sd.head())
    print("write json file " + day + "\n");
    if (i<9):
        day = '0' + str(i+1)
    else : 
        day = str(i+1)
    file_name = prefix + day + postfix
    print(file_name)
    data_sd.to_json(file_name)

Pre-processing 2016-04-01





  canton  main_mean  sentiment_mean  vader_mean  main_std  sentiment_std  \
0     BE   0.010975        0.158416    0.122261  0.113088       0.634539   
1     NW   0.071428        1.000000    0.764400       NaN            NaN   
2     FR   0.025523        0.240000    0.111044  0.096528       0.597216   
3     VD  -0.001714        0.164062    0.124496  0.081044       0.696191   
4     GE  -0.001956        0.069523    0.067172  0.097995       0.726309   

   vader_std  sentiment_pv      vader_pv   main_pv  count  
0   0.368261      0.563529  5.807815e-01  0.737591    202  
1        NaN           NaN           NaN       NaN      1  
2   0.353070      0.651610  7.171860e-01  0.381363     25  
3   0.397402      0.737204  7.261960e-01  0.177596    128  
4   0.392874      0.000001  6.796775e-08  0.001662   1237  
write json file 2016-04-01

viz-data/__harvest3r_twitter_data_01-04_0.json
Pre-processing 2016-04-02

  canton  main_mean  sentiment_mean  vader_mean  main_std  sentiment_std  \
0    

In [None]:
#statistical significance

import scipy.stats as stats

# day 
data_day1 = data_sent_canton[data_sent_canton.day == '2016-04-01']
#print(data_day1)
data_day2 = data_sent_canton[data_sent_canton.day == '2016-04-02']
#print(data_day2)
data_day3 = data_sent_canton[data_sent_canton.day == '2016-04-03']
#print(data_day3)
data_day4 = data_sent_canton[data_sent_canton.day == '2016-04-04']
#print(data_day4)
data_day5 = data_sent_canton[data_sent_canton.day == '2016-04-05']
#print(data_day5)

stats.friedmanchisquare(data_day1,data_day2,data_day3,data_day4,data_day5)

In [None]:
# canton