In [1]:
import pandas as pd
%matplotlib inline

In [2]:
# Both input CSVs sit in the same dataset folder; this is an absolute,
# machine-specific Windows path.
_datasets_dir = r"C:\Users\anees\Documents\CS\MP\datasets"
stream_path = _datasets_dir + "\\" + "comment_stream_117k.csv"
mod_path = _datasets_dir + "\\" + "top_week_full.csv"

In [3]:
# Read the two comment dumps into DataFrames (stream file first, then the
# top-of-week file), using pandas' default CSV parsing options.
stream_df, mod_df = (pd.read_csv(csv_path) for csv_path in (stream_path, mod_path))

In [4]:
# Preview the first rows of the streamed-comments frame (comment text and subreddit).
stream_df.head()

Unnamed: 0,comment,subreddit
0,"Yeah, my library doesn't have DVD's. Books and...",funny
1,yes yes yes. I don't understand why more peopl...,ufl
2,So dreamy.,askgaybros
3,Thank you! First painting i'm feeling proud of...,apexlegends
4,Grammar,PewdiepieSubmissions


In [5]:
# Preview the first rows of the top-of-week frame (subreddit, author, comment, score).
mod_df.head()

Unnamed: 0,subreddit,author,comment,score
0,movies,mi-16evil,[Gunn has confirmed this on his Instagram](htt...,1
1,movies,McFeely_Smackup,"""Sorry James, you're fired""\n\n""Ok, I guess I'...",5082
2,movies,skepticallypessimist,Thats illegal,662
3,movies,dwide_k_shrude,He will make it legal.,266
4,movies,apollodeen,We'll pay you 20 million and give you GOTG 3 i...,107


In [6]:
# joblib used to be vendored as sklearn.externals.joblib; that alias was
# deprecated in scikit-learn 0.21 and removed in 0.23. Prefer the standalone
# package and fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import pickle

In [7]:
# Load the six persisted models lr_char_0.joblib .. lr_char_5.joblib in index
# order; later cells pair classifiers[k] with class_names[k].
classifiers = [
    joblib.load("lr_char_" + str(model_idx) + ".joblib")
    for model_idx in range(6)
]

In [8]:
# Hyperparameters for the word-level and character-level TF-IDF vectorizers.
# NOTE: the instances created here are unfitted, and both names are rebound to
# unpickled objects in the following cell (In [9]) -- presumably the fitted
# versions of these same configurations; TODO confirm against the training code.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
# stop_words is deliberately not passed here: scikit-learn only applies it when
# analyzer == 'word', so it was a no-op on the character analyzer.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000)

In [9]:
# Rebind both vectorizer names to the objects persisted on disk (later cells
# call .transform on them without fitting, so they are presumably already fitted).
# The files are opened in `with` blocks so the handles are closed deterministically.
# SECURITY: pickle.load can execute arbitrary code embedded in the file -- only
# load pickles that you produced yourself or otherwise trust.
with open("word_vectorizer.pickle", 'rb') as word_file:
    word_vectorizer = pickle.load(word_file)
with open("char_vectorizer.pickle", 'rb') as char_file:
    char_vectorizer = pickle.load(char_file)

In [10]:
# Drop every row with a missing value in any column, reporting the frame's
# shape before and after so the number of removed rows is visible.
shape_before = mod_df.shape
mod_df = mod_df.dropna()
print(shape_before, mod_df.shape, sep="\n")

(101362, 4)
(101358, 4)


In [11]:
# Drop every row with a missing value in any column, reporting the frame's
# shape before and after so the number of removed rows is visible.
shape_before = stream_df.shape
stream_df = stream_df.dropna()
print(shape_before, stream_df.shape, sep="\n")

(117242, 2)
(117242, 2)


In [12]:
# Label names, in the order used to index `classifiers` in the prediction cells.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [13]:
# Add one placeholder column per label, initialised to 0; the prediction cell
# further down overwrites these with classifier probabilities.
for label in class_names:
    mod_df[label] = 0

In [14]:
# Add one placeholder column per label, initialised to 0; the prediction cell
# further down overwrites these with classifier probabilities.
for label in class_names:
    stream_df[label] = 0

In [15]:
# Vectorize the top-of-week comments with both TF-IDF vectorizers.
print("generating word features")
mod_df_wordfeat = word_vectorizer.transform(mod_df.comment)
print("creating char features")
mod_df_charfeat = char_vectorizer.transform(mod_df.comment)
# Stack char features first, then word features -- presumably the column layout
# the classifiers were trained on; TODO confirm against the training code.
# format='csr' because hstack returns COO by default, which scikit-learn's
# linear models would otherwise convert to CSR on every predict_proba call.
mod_df_feat = hstack([mod_df_charfeat, mod_df_wordfeat], format='csr')
# Report completion only once the combined matrix actually exists.
print("done")

generating word features
creating char features
done


In [16]:
# Score every comment with each classifier: classifier k fills the column named
# class_names[k] with the second column ([:, 1]) of its predict_proba output.
for label_idx, label in enumerate(class_names):
    label_proba = classifiers[label_idx].predict_proba(mod_df_feat)
    mod_df[label] = label_proba[:, 1]

In [17]:
# Rebind the three feature-matrix names to 0 so the sparse matrices they held
# are no longer referenced by these names.
mod_df_wordfeat = mod_df_charfeat = mod_df_feat = 0

In [18]:
# Vectorize the streamed comments with both TF-IDF vectorizers.
print("generating word features")
stream_df_wordfeat = word_vectorizer.transform(stream_df.comment)
print("creating char features")
stream_df_charfeat = char_vectorizer.transform(stream_df.comment)
# Stack char features first, then word features -- presumably the column layout
# the classifiers were trained on; TODO confirm against the training code.
# format='csr' because hstack returns COO by default, which scikit-learn's
# linear models would otherwise convert to CSR on every predict_proba call.
stream_df_feat = hstack([stream_df_charfeat, stream_df_wordfeat], format='csr')
# Report completion only once the combined matrix actually exists.
print("done")

generating word features
creating char features
done


In [19]:
# Score every comment with each classifier: classifier k fills the column named
# class_names[k] with the second column ([:, 1]) of its predict_proba output.
for label_idx, label in enumerate(class_names):
    label_proba = classifiers[label_idx].predict_proba(stream_df_feat)
    stream_df[label] = label_proba[:, 1]

In [20]:
# Rebind the three feature-matrix names to 0 so the sparse matrices they held
# are no longer referenced by these names.
stream_df_wordfeat = stream_df_charfeat = stream_df_feat = 0

In [21]:
# Summary statistics of the predicted per-label probabilities for the streamed comments.
stream_df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,117242.0,117242.0,117242.0,117242.0,117242.0,117242.0
mean,0.122573,0.009505,0.062581,0.003517,0.05033,0.009889
std,0.156798,0.028083,0.130044,0.004244,0.085787,0.012836
min,0.002468,0.000824,0.002985,0.000846,0.002087,0.001552
25%,0.04853,0.004818,0.023183,0.002597,0.021627,0.006613
50%,0.074789,0.006279,0.031806,0.00307,0.030463,0.008282
75%,0.121335,0.008209,0.04562,0.00363,0.044849,0.010313
max,0.999995,0.90839,0.999999,0.51358,0.999074,0.883107


In [22]:
# Summary statistics of the comment score and the predicted per-label
# probabilities for the top-of-week comments.
mod_df.describe()

Unnamed: 0,score,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,101358.0,101358.0,101358.0,101358.0,101358.0,101358.0,101358.0
mean,70.726248,0.137612,0.010313,0.070519,0.003588,0.056035,0.010935
std,713.008978,0.165839,0.027612,0.140782,0.003521,0.091188,0.015792
min,-207.0,0.003432,0.001425,0.003622,0.001116,0.003547,0.001873
25%,1.0,0.056494,0.005274,0.025612,0.002693,0.024107,0.007219
50%,2.0,0.083435,0.006701,0.034378,0.00314,0.032956,0.008871
75%,6.0,0.136652,0.008611,0.04945,0.003679,0.049208,0.011031
max,69534.0,0.99998,0.868803,0.999981,0.275456,0.998294,0.742926


In [23]:
# Persist both scored frames as CSV files in the working directory, without
# writing the DataFrame index as a column.
for scored_frame, out_name in (
    (stream_df, "stream_pred.csv"),
    (mod_df, "top_week_full_pred.csv"),
):
    scored_frame.to_csv(out_name, index=False)