In [9]:
import pandas as pd
import numpy as np
#cleaning text data using textacy
from textacy.preprocess import preprocess_text

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

    Load data

In [10]:
tweets = pd.read_csv('data/training_set.csv')

    Clean

In [11]:
tweet_text = tweets['text'].values
clean_text = [preprocess_text(x, fix_unicode=True, lowercase=True, no_urls=True, no_emails=True, no_phone_numbers=True, no_currency_symbols=True,
                              no_punct=True, no_accents=True)
              for x in tweet_text]

    Modeling

In [12]:
tweets.screen_name.unique()

array(['prattprattpratt', 'azizansari', 'VancityReynolds', 'evilhag',
       'Nick_Offerman', 'mradamscott', 'JimOHeir', 'MeganMullally'],
      dtype=object)

In [13]:
# creating target
def target_encod(x):
    if x == 'prattprattpratt': return 1
    elif x == 'azizansari': return 2
    elif x == 'VancityReynolds': return 3
    elif x == 'evilhag': return 4
    elif x == 'Nick_Offerman': return 5
    elif x == 'mradamscott': return 6
    elif x == 'JimOHeir': return 7
    else: return 8

y = tweets['screen_name'].map(lambda x: x).values
print (max(pd.Series(y).value_counts(normalize=True)))

0.125


In [14]:
# Vectorizing with TF-IDF Vectorizer and creating X matrix
tfv = TfidfVectorizer(ngram_range=(2,4), max_features=2000)
X = tfv.fit_transform(clean_text).todense()
print (X.shape)

(8000, 2000)


In [15]:
from sklearn.model_selection import GridSearchCV

lr = LogisticRegression()
params = {'penalty': ['l1', 'l2'], 'C':np.logspace(-5,0,100)}
#Grid searching to find optimal parameters for Logistic Regression
gs = GridSearchCV(lr, param_grid=params, cv=10, verbose=1)
gs.fit(X, y)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed:  8.4min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([1.00000e-05, 1.12332e-05, ..., 8.90215e-01, 1.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [16]:
print (gs.best_params_)
print (gs.best_score_)

{'C': 0.7054802310718645, 'penalty': 'l2'}
0.402375


In [20]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(LogisticRegression(C=0.7054802310718645, penalty='l2'), X, y, cv=10)

accuracies

array([0.37875, 0.35375, 0.43   , 0.4325 , 0.44125, 0.42125, 0.37625,
       0.4    , 0.43375, 0.35625])

    Fit

In [18]:
estimator = LogisticRegression(C=0.7054802310718645, penalty='l2')
estimator.fit(X,y)

LogisticRegression(C=0.7054802310718645, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

    Predictions

In [25]:
# Prep our source as TfIdf vectors
source_test = [
    "Good Morning America. @GMA (is the name of a television program I will appear on this morning) @heartsbeatloud",
    "omg every interview should be like this 👏🏽🙌🏽👍🏽"
   ]

Xtest = tfv.transform(source_test)
pd.DataFrame(estimator.predict_proba(Xtest), columns=['prattprattpratt', 'azizansari', 'VancityReynolds', 'evilhag',
       'Nick_Offerman', 'mradamscott', 'JimOHeir', 'MeganMullally'])

Unnamed: 0,prattprattpratt,azizansari,VancityReynolds,evilhag,Nick_Offerman,mradamscott,JimOHeir,MeganMullally
0,0.080199,0.101831,0.295989,0.118075,0.085917,0.1205,0.122639,0.074851
1,0.079723,0.117248,0.19692,0.106517,0.119884,0.123106,0.133167,0.123435
