# Setup

In [None]:
# Mount Google Drive at /content/drive; the data files read below live under that path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import chi2

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
!pip install pickle5

# Load Data

In [None]:
# Prefer the pickle5 backport when it is installed (it adds protocol-5 support to
# Python < 3.8). The standard-library pickle reads protocol 5 natively from Python 3.8,
# so fall back to it instead of failing when pickle5 is not available.
try:
    import pickle5 as pickle
except ImportError:
    import pickle

# NOTE: unpickling can execute arbitrary code embedded in the file -- only load
# pickles that come from a trusted source.
with open('/content/drive/MyDrive/Piper Gradient/Not-So-Twitterpated/data/NST04_tweet_communities.pickle', 'rb') as f:
    tweet_comm_df = pickle.load(f)

# Show the available columns.
tweet_comm_df.columns

In [None]:
# Preview selected columns for the first five rows.
tweet_comm_df.head()[[
    'cat', 'neg', 'pos', 'neu', 'comp', 'TBpol',
    'hfs', 'cdf_neg', 'cdf_neu', 'cdf_pos', 'page_rank',
]]

In [None]:
# Show the `text4` column; the double brackets keep the result a one-column DataFrame.
tweet_comm_df[['text4']]

In [None]:
# Convert columns to categorical.
# `Piper_cat` is created as a copy of `Piper_typ`, so `Piper_typ` keeps its original
# dtype while `community`, `Piper_cat` and `cat` are converted to pandas 'category'.
# Note: this mutates `tweet_comm_df` in place.
tweet_comm_df['Piper_cat']=tweet_comm_df['Piper_typ']
for col in ['community', 'Piper_cat', 'cat']:
    tweet_comm_df[col] = tweet_comm_df[col].astype('category')

In [None]:
# Summarise dtypes and non-null counts (the converted columns should report 'category').
tweet_comm_df.info()

In [None]:
# Number of rows carrying each community label.
tweet_comm_df['community'].value_counts()

In [None]:
# Bar chart of the mean `incl_assoc_score` within each community.
mean_incl_assoc = tweet_comm_df.groupby('community')['incl_assoc_score'].mean()
mean_incl_assoc.plot.bar()

In [None]:
# Frequency of each `Piper_typ` value among the rows in community 9.
in_community_9 = tweet_comm_df['community'] == 9
tweet_comm_df.loc[in_community_9, 'Piper_typ'].value_counts()

Look at counts by community: number of tweets, unique tweets, and unique users.
(Note that a user can belong to more than one community if they tweet in different styles.)

In [None]:
# Per-community counts: a = rows, b = distinct `text4` values, c = distinct `user_id`s.
by_community = tweet_comm_df.groupby('community')
a = tweet_comm_df['community'].value_counts()
b = by_community['text4'].nunique()
c = by_community['user_id'].nunique()

# Side-by-side table, aligned on the community label.
pd.concat([a, b, c], axis=1)

In [None]:
# Rows in community 6 whose retweet_count is exactly 2123.
is_match = (tweet_comm_df['community'] == 6) & (tweet_comm_df['retweet_count'] == 2123)
a = tweet_comm_df.loc[is_match, ['text4', 'retweet_count', 'favorite_count']].sort_values('retweet_count')

# Print the full text of the first matching row, then display the matching rows.
b = a.iloc[0, 0]
print(b)
a

In [None]:
# why is this tweet repeated without re-tweeting?
# List every row whose text starts with the same opening words, ordered by retweet count.
starts_with_phrase = tweet_comm_df['text4'].str.startswith('@user Republicans are against')
tweet_comm_df.loc[starts_with_phrase, ['text4', 'user_description', 'retweet_count']].sort_values('retweet_count')

In [None]:
# Load the `cleaned_tweets_large_Piper_sentiment` pickle to look up the same tweet there.
df2 = pd.read_pickle('/content/drive/MyDrive/Piper Gradient/Not-So-Twitterpated/cleaned_tweets_large_Piper_sentiment.pickle')

# Matching rows ordered by creation time, with retweet_count breaking ties.
# A single multi-key sort replaces two chained single-key sorts: with the default
# (non-stable) sort algorithm the first of the chained sorts had no guaranteed effect.
repeated_in_df2 = df2.loc[
    df2['text4'].str.startswith('@user Republicans are against'),
    ['text4', 'user_description', 'created_at', 'retweet_count'],
].sort_values(['created_at', 'retweet_count'])

# Text of the third row in that ordering (raises IndexError if fewer than three match).
repeated_in_df2['text4'].iloc[2]

In [None]:
# Rows of tweet_comm_df whose text starts with the phrase, ordered by retweet count.
phrase_mask = tweet_comm_df['text4'].str.startswith('@user Republicans are against')
tweet_comm_df.loc[phrase_mask, ['text4', 'user_description', 'retweet_count']].sort_values('retweet_count')

In [None]:
# Columns to average within each community.
cols = [
    'incl_affil_score', 'incl_assoc_score', 'excl_affil_score', 'excl_assoc_score',
    'abs_terms_score', 'is_reply', 'is_retweet', 'Piper_typ',
    'cdf_neg', 'cdf_neu', 'cdf_pos', 'retweet_count', 'favorite_count', 'page_rank',
]

# Mean of each listed column per community label.
tweet_comm_df.groupby('community')[cols].mean()

In [None]:
# New list: the columns above plus the community label (`cols` itself is left unchanged).
cols2 = [*cols, 'community']
print(cols2)

In [None]:
# Write the `cols2` columns (including the community label) to CSV without the row index.
tweet_comm_df[cols2].to_csv('/content/drive/MyDrive/Piper Gradient/Not-So-Twitterpated/data/clusters.csv', index=False)

In [None]:
# For each community keep the row with the highest page_rank.
# Rows are sorted ascending and the last row per community is kept. NaN page ranks are
# placed first because sort_values puts them last by default, which would otherwise let
# a row with a missing page_rank be selected as a community's top tweet.
a = (
    tweet_comm_df[['community', 'page_rank', 'text4']]
    .sort_values('page_rank', na_position='first')
    .drop_duplicates(['community'], keep='last')
    .sort_values('community')
)
print(a)

# Print each selected tweet's full text, separated by blank lines.
for tw in a.text4:
    print(tw, '\n')

In [None]:
# Score columns summarised here and reused by later cells.
disc_cols = [
    'incl_affil_score',
    'incl_assoc_score',
    'excl_affil_score',
    'excl_assoc_score',
    'abs_terms_score',
]

# Descriptive statistics, one row per score column.
tweet_comm_df[disc_cols].describe().transpose()

In [None]:
# GroupBy object keyed on the community label, reused by the next cells.
community = tweet_comm_df.groupby('community')

In [None]:
# Number of rows in each community group.
community.size()

In [None]:
# Per-community totals of each score column.
# The aggregation is named by string: passing the np.sum callable is deprecated in
# recent pandas (it emits a FutureWarning) and produces the same 'sum' column level.
# numpy is already imported at the top of the notebook, so the repeated import that
# used to sit in this cell is not needed.
community[disc_cols].agg(['sum'])

In [None]:
# Community x `cat` contingency table, excluding community -1.
# The frame is filtered once so both crosstab inputs come from exactly the same rows,
# rather than relying on crosstab to index-align a filtered Series with an unfiltered
# one (which hides the intent and breaks if the index is not unique).
sent_rows = tweet_comm_df[tweet_comm_df['community'] != -1]
com_sent_xtb = pd.crosstab(sent_rows['community'], sent_rows['cat'], margins=False)

In [None]:
# `Piper_typ` x community contingency table, excluding community -1.
# The frame is filtered once so both crosstab inputs come from exactly the same rows,
# rather than relying on crosstab to index-align an unfiltered Series with a filtered
# one (which hides the intent and breaks if the index is not unique).
type_rows = tweet_comm_df[tweet_comm_df['community'] != -1]
com_type_xtb = pd.crosstab(type_rows['Piper_typ'], type_rows['community'], margins=False)

display(com_type_xtb)

In [None]:
# Drop community -1.
temp_df = tweet_comm_df.loc[tweet_comm_df['community'] != -1]

# Per-community totals of the score columns.
# `community` is categorical, so the filtered-out -1 label is still one of its
# categories. Under the historical default (observed=False) groupby would bring it back
# as an all-zero row, which would then appear in the heatmap; observed=True keeps only
# the categories that actually occur in temp_df.
com_disc_xtb = temp_df.groupby('community', observed=True)[disc_cols].sum()

display(com_disc_xtb)

In [None]:
# Heatmap of the per-community score totals on an explicitly created 12x8 figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(com_disc_xtb, cmap="YlGnBu", ax=ax)

In [None]:
# Chi-square test of independence on the `Piper_typ` x community table.
# chi2_contingency returns the statistic, p-value, degrees of freedom and expected counts.
c, p, dof, expected = chi2_contingency(com_type_xtb)

print('dof=%d' % dof)

# Decision 1: compare the test statistic with the critical value at the chosen level.
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, c))
print('Dependent (reject H0)' if abs(c) >= critical else 'Independent (fail to reject H0)')

# Decision 2: compare the p-value with the significance level.
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
print('Dependent (reject H0)' if p <= alpha else 'Independent (fail to reject H0)')

In [None]:
# Chi-square test of independence on the community x `cat` table.
# chi2_contingency returns the statistic, p-value, degrees of freedom and expected counts.
c, p, dof, expected = chi2_contingency(com_sent_xtb)

print('dof=%d' % dof)

# Decision 1: compare the test statistic with the critical value at the chosen level.
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, c))
print('Dependent (reject H0)' if abs(c) >= critical else 'Independent (fail to reject H0)')

# Decision 2: compare the p-value with the significance level.
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
print('Dependent (reject H0)' if p <= alpha else 'Independent (fail to reject H0)')