In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from textblob import TextBlob, Word
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

%matplotlib inline

In [18]:
twitch = pd.read_csv('project-query-oct-16.csv')

binary_mapping = {
           'community': 1,
           'non_community': 0}

twitch['community'] = twitch['preliminary_tag'].map(binary_mapping)

twitch.head(5)


Unnamed: 0,time,day_of_week,city,country,channel,status,game,avg_ccu,avg_chat_activity,preliminary_tag,community
0,6/26/2016 13:31,0,San Francisco,US,faceittv,ECS Season 1 Wembley Finals - Luminosity vs. G...,Counter-Strike: Global Offensive,124222,863,non_community,0.0
1,7/5/2016 22:56,2,San Francisco,US,gamesdonequick,SGDQ 2016 benefitting Doctors Without Borders ...,DinoCity,84981,474,non_community,0.0
2,7/4/2016 0:54,1,San Francisco,US,gamesdonequick,SGDQ 2016 benefitting Doctors Without Borders ...,Crystal's Pony Tale,79915,317,non_community,0.0
3,7/24/2016 12:14,0,Boardman,US,nalcs2,NA LCS: TSM vs. Team Envy,League of Legends,78054,318,non_community,0.0
4,6/25/2016 15:01,6,San Francisco,US,nalcs1,NA LCS: Liquid vs. Immortals,League of Legends,60552,138,non_community,0.0


In [19]:
twitch = twitch.dropna()
twitch.shape

print 'Null Accuracy: ', (4937-279)/4937.0

Null Accuracy:  0.943487948147


In [20]:
from sklearn.base import TransformerMixin

class MyCountVectorizer(TransformerMixin):
    def transform(self, X, **transform_params): 
        return self.vec.transform(X['status'])

    def fit(self, X, y=None, **fit_params):
        self.vec = CountVectorizer()
        self.vec.fit(X['status'])
        return self




In [21]:
from sklearn.base import TransformerMixin

class TwitchTransformer(TransformerMixin):
    def transform(self, X, **transform_params): 
        game_dummies = pd.get_dummies(X['game'], prefix='game_').iloc[:, 1:]
        new_X = pd.concat([X, game_dummies], axis = 1)

        # concatenate the dummy variable columns onto the original DataFrame (axis=0 means rows, axis=1 means columns)
        channel_dummies = pd.get_dummies(X['channel'], prefix='channel_').iloc[:, 1:]
        new_X = pd.concat([new_X, channel_dummies], axis = 1)        
        cols_to_remove = []
        for col in self.g+self.c:
            if col not in new_X.columns:
                new_X[col] = [0]*(new_X.shape[0])
        for col in new_X.columns:
            if ('game_' in col or 'channel_' in col) and col not in self.g+self.c:
                cols_to_remove.append(col)
        new_X = new_X.drop(cols_to_remove+['time', 'city', 'country', 'channel', 'status', 'game', 'preliminary_tag'], axis=1)
        
        return new_X

    def fit(self, X, y=None, **fit_params):
        self.g = pd.get_dummies(X['game'], prefix='game_').iloc[:, 1:].columns
        self.c = pd.get_dummies(X['channel'], prefix='channel_').iloc[:, 1:].columns
        return self



In [22]:
X = twitch.drop('community', axis=1)
y = twitch.community


In [26]:
knn = KNeighborsClassifier()
trans = TwitchTransformer()

# k_range = range(1, 1000, 50)
# param_grid = dict(n_neighbors=k_range)
data = pd.DataFrame(trans.fit_transform(X))

grid = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
print grid.fit(data, y)
print grid.best_score_ 
print grid.best_params_ 
print grid.best_estimator_ 




GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)
0.717237188576
{'n_neighbors': 98}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=98, p=2,
           weights='uniform')


In [13]:
logreg = LogisticRegression()
trans = TwitchTransformer()

for i in range(1,11):
    feature_filter = SelectKBest(k=i)
    pipe = Pipeline([ 
      ('non_text_features', trans),
      ('anova', feature_filter),
      ('classifier', logreg)
    ])
    mean = cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()
    print i, mean




1 0.943488552383
2 0.943488552383
3 0.943488552383
4 0.94308287287
5 0.943488552383
6 0.887820536188
7 0.888022965338
8 0.888832681937
9 0.888832681937
10 0.889439969387


In [14]:
nb = MultinomialNB()

for i in range(1,11):
    feature_filter = SelectKBest(k=i)
    pipe = Pipeline([ 
      ('non_text_features', trans),
      ('anova', feature_filter),
      ('classifier', nb)
    ])
    mean = cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()
    print i, mean





1 0.943488552383
2 0.943488552383
3 0.943488552383
4 0.943488552383
5 0.943488552383
6 0.943488552383
7 0.943488552383
8 0.943488552383
9 0.943488552383
10 0.943488552383


In [15]:
rf = RandomForestClassifier()

for i in range(1,11):
    feature_filter = SelectKBest(k=i)
    pipe = Pipeline([ 
      ('non_text_features', trans),
      ('anova', feature_filter),
      ('classifier', rf)
    ])
    mean = cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()
    print i, mean




1 0.943488552383
2 0.943488552383
3 0.943488552383
4 0.94308287287
5 0.943488552383
6 0.857456163719
7 0.857456163719
8 0.809278026067
9 0.276545273605
10 0.343144463889


In [52]:
knn = KNeighborsClassifier()
count = MyCountVectorizer()

k_range = range(2,100,2)
param_grid = dict(n_neighbors=k_range)
data = count.fit_transform(X)

grid = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
print grid.fit(data, y)
print grid.best_score_ 
print grid.best_params_
print grid.best_estimator_ 



GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)
0.974275876038
{'n_neighbors': 2}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=2,
           weights='uniform')


In [16]:
logreg = LogisticRegression()
count = MyCountVectorizer()

pipe = Pipeline([ 
  ('count', count),
  ('classifier', logreg)
])

mean = cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()
mean

0.97974803998682591

In [17]:
nb = MultinomialNB()

pipe = Pipeline([ 
  ('count', count),
  ('classifier', nb)
])

mean = cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()
mean

0.97061775725500488

In [18]:
rf = RandomForestClassifier()

pipe = Pipeline([ 
  ('count', count),
  ('classifier', rf)
])

mean = cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()
mean

0.98015331056234911

In [31]:
knn = KNeighborsClassifier(n_neighbors=201)
count = MyCountVectorizer()
param_grid = dict(n_neighbors=k_range)


feature_change = FeatureUnion([
        ('counts', MyCountVectorizer()),
        ('other_features', TwitchTransformer())
  ])

pipe = Pipeline([ 
  ('features', feature_change),
  ('classifier', knn)
])

mean = cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()
mean

# grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
# print grid.fit(X, y)
# print grid.best_score_ 
# print grid.best_params_
# print grid.best_estimator_ 





0.94348855238271123

In [19]:
logreg = LogisticRegression()

feature_change = FeatureUnion([
        ('counts', MyCountVectorizer()),
        ('other_features', TwitchTransformer())
  ])

pipe = Pipeline([ 
  ('features', feature_change),
  ('classifier', logreg)
])

mean = cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()
mean




0.93581515597082721

In [20]:
nb = MultinomialNB()

feature_change = FeatureUnion([
        ('counts', MyCountVectorizer()),
        ('other_features', TwitchTransformer())
  ])

pipe = Pipeline([ 
  ('features', feature_change),
  ('classifier', nb)
])

mean = cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()
mean





0.94248997501841547

In [21]:
rf = RandomForestClassifier()


feature_change = FeatureUnion([
        ('counts', MyCountVectorizer()),
        ('other_features', TwitchTransformer())
  ])

pipe = Pipeline([ 
  ('features', feature_change),
  ('classifier', rf)
])

mean = cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()
mean




0.97124765979164018