In [1]:
import warnings
warnings.filterwarnings('ignore')

In [38]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

df = pd.read_json('categorized-comments.jsonl', lines=True)

# Create test and train datasets
df = df.head(n=500)

df['id'] = df.index + 1

df.dropna(axis=0)

df.set_index('id', inplace = True)

msk = np.random.rand(len(df)) < 0.75

train = df[msk]

test = df[~msk]

train.head()

Unnamed: 0_level_0,cat,txt
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,sports,Barely better than Gabbert? He was significant...
2,sports,Fuck the ducks and the Angels! But welcome to ...
3,sports,Should have drafted more WRs.\n\n- Matt Millen...
4,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
6,sports,Ding dong the Kaepers gone!!!!!! Yes!!!! Frida...


In [39]:
df = train

In [40]:
import re
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

#creating a function to encapsulate preprocessing, to mkae it easy to replicate on  submission data
def processing(df):
    #lowering and removing punctuation
    df['processed'] = df['txt'].apply(lambda x: re.sub(r'[^\w\s]','', x.lower()))
    
    #numerical feature engineering
    #total length of sentence
    df['length'] = df['processed'].apply(lambda x: len(x))
    #get number of words
    df['words'] = df['processed'].apply(lambda x: len(x.split(' ')))
    df['words_not_stopword'] = df['processed'].apply(lambda x: len([t for t in x.split(' ') if t not in stopWords]))
    #get the average word length
    df['avg_word_length'] = df['processed'].apply(lambda x: np.mean([len(t) for t in x.split(' ') if t not in stopWords]) if len([len(t) for t in x.split(' ') if t not in stopWords]) > 0 else 0)
    #get the average word length
    df['commas'] = df['txt'].apply(lambda x: x.count(','))

    return(df)

df = processing(df)

df.head()

Unnamed: 0_level_0,cat,txt,processed,length,words,words_not_stopword,avg_word_length,commas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,sports,Barely better than Gabbert? He was significant...,barely better than gabbert he was significantl...,505,94,55,5.327273,8
2,sports,Fuck the ducks and the Angels! But welcome to ...,fuck the ducks and the angels but welcome to a...,64,13,7,5.0,0
3,sports,Should have drafted more WRs.\n\n- Matt Millen...,should have drafted more wrs\n\n matt millen p...,51,8,5,6.0,0
4,sports,[Done](https://i.imgur.com/2YZ90pm.jpg),donehttpsiimgurcom2yz90pmjpg,28,1,1,28.0,0
6,sports,Ding dong the Kaepers gone!!!!!! Yes!!!! Frida...,ding dong the kaepers gone yes friday off to a...,57,12,8,4.625,0


In [41]:
from sklearn.model_selection import train_test_split

features= [c for c in df.columns.values if c  not in ['id','txt','cat']]
numeric_features= [c for c in df.columns.values if c  not in ['id','txt','cat','processed']]
target = 'cat'

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.33, random_state=42)
X_train.head()

Unnamed: 0_level_0,processed,length,words,words_not_stopword,avg_word_length,commas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
390,anybody drink craft beer here,29,5,4,5.25,0
162,jennifer joe honey can you take out the garbag...,218,40,22,5.863636,3
42,they asked him to prove it and he didnt live u...,479,95,52,5.25,1
47,third firstround dl in three years seems a bi...,114,20,15,5.666667,0
216,he can be undecided all he likes dont see anyw...,165,37,17,4.352941,1


In [42]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Dump the trained decision tree classifier with Pickle
tfidf_model_file = 'TfidfVectorizer.pkl'

# Open the file to save as pkl file
tfidf_model_pkl = open(tfidf_model_file, 'wb')

text = Pipeline([
                ('selector', TextSelector(key='processed')),
                ('tfidf', TfidfVectorizer( stop_words='english'))
            ])

tfidf_model = text.fit_transform(X_train)
pickle.dump(tfidf_model, tfidf_model_pkl)
tfidf_model

<243x1504 sparse matrix of type '<class 'numpy.float64'>'
	with 2865 stored elements in Compressed Sparse Row format>

In [44]:
from sklearn.preprocessing import StandardScaler

length =  Pipeline([
                ('selector', NumberSelector(key='length')),
                ('standard', StandardScaler())
            ])

length.fit_transform(X_train)

array([[-6.87409694e-01],
       [ 4.71529589e-01],
       [ 2.07196955e+00],
       [-1.66193614e-01],
       [ 1.46536034e-01],
       [-3.62416138e-01],
       [-7.30333371e-01],
       [-7.30333371e-01],
       [ 7.29071652e-01],
       [-3.93075908e-01],
       [ 2.09036541e+00],
       [-1.66193614e-01],
       [ 2.36017138e+00],
       [-4.11471769e-01],
       [-4.35545366e-02],
       [-3.99207861e-01],
       [-4.97319123e-01],
       [-6.75145786e-01],
       [ 5.38981082e-01],
       [-5.58638662e-01],
       [-6.01562339e-01],
       [ 2.12102518e+00],
       [ 2.17008081e+00],
       [-3.50152230e-01],
       [-4.72791308e-01],
       [ 3.00289099e-02],
       [-3.86943954e-01],
       [-6.87409694e-01],
       [ 7.90391191e-01],
       [ 1.77763576e+00],
       [ 5.45567254e-02],
       [-1.29401891e-01],
       [-8.22312679e-01],
       [ 9.55953945e-01],
       [-6.13826247e-01],
       [-2.82700738e-01],
       [ 1.77650022e-02],
       [-3.19492461e-01],
       [-3.4

In [45]:
words =  Pipeline([
                ('selector', NumberSelector(key='words')),
                ('standard', StandardScaler())
            ])
words_not_stopword =  Pipeline([
                ('selector', NumberSelector(key='words_not_stopword')),
                ('standard', StandardScaler())
            ])
avg_word_length =  Pipeline([
                ('selector', NumberSelector(key='avg_word_length')),
                ('standard', StandardScaler())
            ])
commas =  Pipeline([
                ('selector', NumberSelector(key='commas')),
                ('standard', StandardScaler()),
            ])

In [46]:
from sklearn.pipeline import FeatureUnion

feats = FeatureUnion([('text', text), 
                      ('length', length),
                      ('words', words),
                      ('words_not_stopword', words_not_stopword),
                      ('avg_word_length', avg_word_length),
                      ('commas', commas)])

feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

<243x1509 sparse matrix of type '<class 'numpy.float64'>'
	with 4080 stored elements in Compressed Sparse Row format>

In [47]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(random_state = 42)),
])

pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)
np.mean(preds == y_test)

1.0

In [48]:
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'features', 'classifier', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__text', 'features__length', 'features__words', 'features__words_not_stopword', 'features__avg_word_length', 'features__commas', 'features__text__memory', 'features__text__steps', 'features__text__selector', 'features__text__tfidf', 'features__text__selector__key', 'features__text__tfidf__analyzer', 'features__text__tfidf__binary', 'features__text__tfidf__decode_error', 'features__text__tfidf__dtype', 'features__text__tfidf__encoding', 'features__text__tfidf__input', 'features__text__tfidf__lowercase', 'features__text__tfidf__max_df', 'features__text__tfidf__max_features', 'features__text__tfidf__min_df', 'features__text__tfidf__ngram_range', 'features__text__tfidf__norm', 'features__text__tfidf__preprocessor', 'features__text__tfidf__smooth_idf', 'features__text__tfidf__stop_words', 'features__text__tfidf__strip_accents', 'features__text__

In [49]:
from sklearn.model_selection import GridSearchCV

hyperparameters = { 'features__text__tfidf__max_df': [0.9, 0.95],
                    'features__text__tfidf__ngram_range': [(1,1), (1,2)],
                   'classifier__max_depth': [50, 70],
                    'classifier__min_samples_leaf': [1,2]
                  }
clf = GridSearchCV(pipeline, hyperparameters, cv=5)
 
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('selector', TextSelector(key='processed')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', inpu...mators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'features__text__tfidf__max_df': [0.9, 0.95], 'features__text__tfidf__ngram_range': [(1, 1), (1, 2)], 'classifier__max_depth': [50, 70], 'classifier__min_samples_leaf': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [50]:
clf.best_params_

{'classifier__max_depth': 50,
 'classifier__min_samples_leaf': 1,
 'features__text__tfidf__max_df': 0.9,
 'features__text__tfidf__ngram_range': (1, 1)}

In [51]:
#refitting on entire training data using best settings
clf.refit

preds = clf.predict(X_test)
probs = clf.predict_proba(X_test)

np.mean(preds == y_test)

1.0

In [52]:
X_test.head()

Unnamed: 0_level_0,processed,length,words,words_not_stopword,avg_word_length,commas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
227,more the organization not the league buying th...,58,9,4,8.5,0
44,oh well looks like im not the only one who mis...,55,12,7,3.857143,0
19,deleted,7,1,1,7.0,0
429,dude they shouldnt be calling any pass play in...,76,14,8,5.625,0
73,i know we wont be watchable next year but i at...,217,39,21,6.190476,1


In [55]:
submission = test

#preprocessing
submission = processing(submission)
predictions = clf.predict_proba(submission)

preds = pd.DataFrame(data=predictions, columns = clf.best_estimator_.named_steps['classifier'].classes_)

#generating a submission file
result = pd.concat([submission[['id']], preds], axis=1)
result.set_index('id', inplace = True)
result

In [60]:
data = [[1], [1], [0]] 
  
# Create the pandas DataFrame 
y = pd.DataFrame(data, columns = ['P'])

In [62]:
pd.concat([x[['Name']], y], axis=1)

Unnamed: 0,Name,P
0,tom,1
1,nick,1
2,juli,0


In [63]:
test

Unnamed: 0,cat,txt,processed,length,words,words_not_stopword,avg_word_length,commas
4,sports,No!! NOO!!!!!,no noo,6,2,1,3.000000,0
14,sports,o/,o,1,1,0,0.000000,0
16,sports,I believe in upvotes!!,i believe in upvotes,20,4,2,7.000000,0
23,sports,No,no,2,1,0,0.000000,0
27,sports,"I guess at this point, there's just no reason ...",i guess at this point theres just no reason no...,163,30,15,6.000000,2
29,sports,This is under the assumption that the reports ...,this is under the assumption that the reports ...,359,75,35,5.057143,1
31,sports,Needs more filters and comic sans font,needs more filters and comic sans font,38,7,5,5.000000,0
35,sports,Eric Berry was a higher rated prospect than ET...,eric berry was a higher rated prospect than et...,383,73,43,5.418605,3
36,sports,Thanks again Cleveland and Houston,thanks again cleveland and houston,35,6,4,5.500000,0
40,sports,I would go after receivers who prefer to carry...,i would go after receivers who prefer to carry...,74,15,8,4.875000,0
