In [3]:
## basic scikit learn 

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


#from sklearn import datasets, linear_model
from sklearn import metrics

# plotting 

import seaborn as sns
sns.set()
from wordcloud import WordCloud
import matplotlib.pyplot as plt 

# nltk 

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia 

# others 
import csv 
import numpy as np
import pandas as pd 
from datetime import datetime
import random
import collections
import string 
import re

import datetime
from utils import *  # imports things like feature_engineering 

# Features to use : 

- sentiment analysis for the text - positive or negative using nltk 
- time of the day 
- length of tweet 
- mention of I /#realDonaldTrump
- number of hashtags
- starts with "@
- number of links/retweets
- the most common @ and links from Donald Trump vs the most common @ from his staff, see if there is a pattern 


In [22]:
alldat = pd.read_csv('./all/train.csv')
data_train_orig = alldat

# shuffling the tweets around and selecting a fraction for holdout testing if needed; default is 1. 

frac = 1

lim = int(data_train_orig.shape[0]*frac)
random_indxs = np.random.permutation(len(data_train_orig))
indxs = random_indxs[0:lim]
indxs_secret = random_indxs[lim:]

data_train = data_train_orig.iloc[indxs]; 
labels_train = alldat['label'].iloc[indxs]; 

data_train_secret = data_train_orig.iloc[indxs_secret]; 
labels_train_secret = alldat['label'].iloc[indxs_secret]; 

data_test =  pd.read_csv('./all/test.csv');

## cleaning up the data 

data_train = feature_engineering(data_train, power=1); 
data_train_secret = feature_engineering(data_train_secret, power=1); 
data_test = feature_engineering(data_test, power=1); 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['Dates'] = pd.to_datetime(data['created']).dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['Time'] = pd.to_datetime(data['created']).dt.time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['hour'] = data['Time'].astype('str').str.split(':').apply (lambda x: int(x[0]) + int(x[1])/60.0)
A value is trying to be set on a copy of a 

In [23]:
feature_list = ['hour', 'length','nhashs', 'nlinks','nats',  'pos', 'neg', 'neu', 'nself', 'rt', 'ncampaign']

X_train=data_train[feature_list]  # Features
y_train=data_train['label'] # Labels

if frac<1 : 
    
    X_secret=data_train_secret[feature_list]  # Features
    y_secret=data_train_secret['label'] # Labels

X_test=data_test[feature_list]  # Features

# Ensembling 

In [17]:
clf_ada = AdaBoostClassifier(n_estimators=100).fit(X_train, y_train)

clf_rfc=RandomForestClassifier(n_estimators=500).fit(X_train, y_train)

clf_gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                     max_depth=1, random_state=0).fit(X_train, y_train)

eclf = VotingClassifier(estimators=[('abc', clf_ada), ('rf', clf_rfc), ('gbc', clf_gbc)], voting='soft').fit(X_train, y_train) # ,voting='soft', weights=[2, 1, 2]

for clf, label in zip([clf_ada, clf_rfc, clf_gbc, eclf], ['Adaboost', 'Random Forest', 'Gradient Boost', 'Ensemble']):
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.88 (+/- 0.03) [Adaboost]
Accuracy: 0.89 (+/- 0.02) [Random Forest]
Accuracy: 0.88 (+/- 0.02) [Gradient Boost]
Accuracy: 0.90 (+/- 0.02) [Ensemble]


In [14]:
params = {'rf__n_estimators': [100, 500, 1000],
          'abc__n_estimators':[100, 500, 1000], 
         'gbc__n_estimators':[100, 500, 1000]}
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid = grid.fit(X_train, y_train)

In [15]:
grid.best_params_

{'abc__n_estimators': 100, 'gbc__n_estimators': 100, 'rf__n_estimators': 500}

In [19]:
if frac<1: 
    for clf, label in zip([clf_ada, clf_rfc, clf_gbc, eclf], ['Adaboost', 'Random Forest', 'Gradient Boost', 'Ensemble']):
        scores = cross_val_score(clf, X_secret, y_secret, cv=5, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [18]:
y_pred=eclf.predict(X_test)
assert len(y_pred)==300

preds = pd.DataFrame({'ID': np.arange(300), 'label': y_pred})
preds.to_csv('./preds_ag_'+str(datetime.datetime.now())+'.csv', index=None)

In [19]:
# Get numerical feature importances
importances = list(clf_rfc.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: nlinks               Importance: 0.21
Variable: hour                 Importance: 0.15
Variable: length               Importance: 0.12
Variable: rt                   Importance: 0.11
Variable: nhashs               Importance: 0.1
Variable: neu                  Importance: 0.08
Variable: nats                 Importance: 0.06
Variable: pos                  Importance: 0.06
Variable: neg                  Importance: 0.05
Variable: ncampaign            Importance: 0.04
Variable: nself                Importance: 0.02


In [20]:
# Get numerical feature importances
importances = list(clf_ada.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: hour                 Importance: 0.53
Variable: pos                  Importance: 0.11
Variable: neg                  Importance: 0.1
Variable: neu                  Importance: 0.08
Variable: length               Importance: 0.07
Variable: nlinks               Importance: 0.04
Variable: rt                   Importance: 0.04
Variable: nhashs               Importance: 0.02
Variable: ncampaign            Importance: 0.01
Variable: nats                 Importance: 0.0
Variable: nself                Importance: 0.0


In [21]:
# Get numerical feature importances
importances = list(clf_gbc.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: nlinks               Importance: 0.59
Variable: rt                   Importance: 0.19
Variable: hour                 Importance: 0.09
Variable: nhashs               Importance: 0.06
Variable: length               Importance: 0.03
Variable: neg                  Importance: 0.02
Variable: neu                  Importance: 0.01
Variable: nats                 Importance: 0.0
Variable: pos                  Importance: 0.0
Variable: nself                Importance: 0.0
Variable: ncampaign            Importance: 0.0


# Making plots 

In [None]:
## make wordclouds 

all_words = ' '.join(merge_rows(data_train[data_train['label']==1]['hashs']))
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110,colormap='plasma',  background_color="white").generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.savefig('plots/drumpf_wordcloud_hashs.jpeg', format='jpeg', dpi=200)

all_words = ' '.join(merge_rows(data_train[data_train['label']==-1]['hashs']))
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110,colormap='plasma',  background_color="white").generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.savefig('plots/minions_wordcloud_hashs.jpeg',format='jpeg', dpi=200)

## make wordclouds 

all_words = ' '.join(merge_rows(data_train[data_train['label']==1]['ats']))
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110, colormap='plasma', background_color="white").generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.savefig('plots/drumpf_wordcloud_ats.jpeg', format='jpeg', dpi=200)

all_words = ' '.join(merge_rows(data_train[data_train['label']==-1]['ats']))
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110, colormap='plasma', background_color="white").generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.savefig('plots/minions_wordcloud_ats.jpeg',format='jpeg', dpi=200)

plt.figure()
sns.distplot(data_train[data_train['label']==1]['hour'], kde=True,color = 'blue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4},label='Drumpf')
sns.distplot(data_train[data_train['label']==-1]['hour'], kde=True,color = 'red', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4},label='Minions')

plt.legend(prop={'size': 16})
plt.title('Time')
plt.xlabel('Time')
plt.ylabel('Density')
plt.savefig('plots/time_density.jpeg', format='jpeg', dpi=200)



plt.figure()
sns.distplot(data_train[data_train['label']==1]['length'], kde=True,color = 'blue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4},label='Drumpf')
sns.distplot(data_train[data_train['label']==-1]['length'], kde=True,color = 'red', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4},label='Minions')

plt.legend(prop={'size': 16})
plt.title('neu')
plt.xlabel('Time')
plt.ylabel('Density')
plt.savefig('plots/length_density.jpeg', format='jpeg', dpi=200)

plt.figure()
sns.distplot(data_train[data_train['label']==1]['pos'], kde=True,color = 'blue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4},label='Drumpf')
sns.distplot(data_train[data_train['label']==-1]['pos'], kde=True,color = 'red', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4},label='Minions')

plt.legend(prop={'size': 16})
plt.title('Positive emotions')
plt.xlabel('Positivity')
plt.ylabel('Density')

plt.savefig('plots/pos_density.jpeg', format='jpeg', dpi=200)

plt.figure()
sns.distplot(data_train[data_train['label']==1]['neg'], kde=True,color = 'blue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4},label='Drumpf')
sns.distplot(data_train[data_train['label']==-1]['neg'], kde=True,color = 'red', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4},label='Minions')

plt.legend(prop={'size': 16})
plt.title('Negative emotions')
plt.xlabel('Negativity')
plt.ylabel('Density')

plt.savefig('plots/neg_density.jpeg', format='jpeg', dpi=200)

plt.figure()
sns.distplot(data_train[data_train['label']==1]['neu'], kde=True,color = 'blue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4},label='Drumpf')
sns.distplot(data_train[data_train['label']==-1]['neu'], kde=True,color = 'red', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4},label='Minions')

plt.legend(prop={'size': 16})
plt.title('neu')
plt.xlabel('Neutrality')
plt.ylabel('Density')

plt.savefig('plots/neu_density.jpeg', format='jpeg', dpi=200)


## Difference in counted features 

In [46]:
# is there a difference in number of links/hashtags used? 

hashs_drumpf = np.sum(gethashs(data_train[data_train['label'] == 1]['text']))
hashs_minions = np.sum(gethashs(data_train[data_train['label'] == -1]['text']))

links_drumpf = np.sum(getlinks(data_train[data_train['label'] == 1]['text']))
links_minions = np.sum(getlinks(data_train[data_train['label'] == -1]['text']))

ats_drumpf = np.sum(getats(data_train[data_train['label'] == 1]['text']))
ats_minions = np.sum(getats(data_train[data_train['label'] == -1]['text']))

allcaps_drumpf = np.sum(data_train[data_train['label'] == 1]['ncaps'])
allcaps_minions = np.sum(data_train[data_train['label'] == -1]['ncaps'])

children_drumpf = len(np.sum(getchildren(data_train[data_train['label'] == 1]['text'])))
children_minions = len(np.sum(getchildren(data_train[data_train['label'] == -1]['text'])))

self_drumpf = np.sum(data_train['nself'][data_train['label'] == 1])
self_minions = np.sum(data_train['nself'][data_train['label'] == -1])

camp_drumpf = np.sum(data_train['ncampaign'][data_train['label'] == 1])
camp_minions = np.sum(data_train['ncampaign'][data_train['label'] == -1])

## total number of hashtags and links 
print("Type  ", ' D ', 'M')
print("------------------")
print('hashs: ', len(hashs_drumpf), len(hashs_minions))
print('links: ', len(links_drumpf), len(links_minions))
print('handles: ', len(ats_drumpf), len(ats_minions))
print('allcaps: ', allcaps_drumpf, allcaps_minions)
print('children: ', children_drumpf, children_minions)
print('self: ', self_drumpf, self_minions)
print('camp: ', camp_drumpf, camp_minions)

Type    D  M
------------------
hashs:  83 449
links:  55 418
handles:  594 123
allcaps:  414 229
children:  69 168
self:  119 1
camp:  16 168
