In [1]:
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from nltk.tokenize import word_tokenize

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
def clean_text(text):
    """Lower-case ``text``, expand common English contractions and drop punctuation.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The lower-cased text with contractions expanded, every non-word
        character replaced by a space, whitespace runs collapsed to a single
        space, and leading/trailing spaces removed.
    """
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    # "'scuse" has to be expanded before the generic "'s" rule below; otherwise
    # that rule eats its first two characters and leaves "cuse" behind.
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    # "can't" must be handled before the generic "n't" rule so it becomes "cannot".
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # Raw strings: '\W' and '\s' are invalid escape sequences in a plain literal.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip(' ')
    return text

def mappingfn1(x):
    """Binarise ``x`` at 0.01: return 0 when ``x < 0.01``, otherwise 1."""
    return 0 if x < 0.01 else 1

def mappingfn2(x):
    """Binarise ``x`` at 0.099999: return 0 when ``x < 0.099999``, otherwise 1.

    The boundary value 0.099999 itself maps to 1. Previously it matched
    neither branch and the function fell through returning None. The
    ``<`` / else shape mirrors ``mappingfn1``; as there, any value that does
    not compare below the cut-off (NaN included) maps to 1.
    """
    return 0 if x < 0.099999 else 1

def zeroes(x):
    """Return 0 when ``x == 0``; return None for every other value."""
    return 0 if x == 0 else None
    
def excluder(x):
    """Return ``x`` unchanged when ``x == 1``; return None for every other value."""
    return x if x == 1 else None

def remove_stopwords(text):
    """Tokenise ``text`` and drop every token found in the global ``stopwords``.

    ``stopwords`` is looked up at call time, so it must already be bound to a
    container supporting ``in``. Tokens come from ``word_tokenize`` and the
    survivors are re-joined with single spaces.
    """
    blocked = stopwords
    kept = [token for token in word_tokenize(text) if token not in blocked]
    return ' '.join(kept)

def remove_special(text, lower=True):
    """Keep only ASCII letters in ``text``, separated by single spaces.

    Parameters
    ----------
    text : str
        Input text.
    lower : bool, default True
        When true, ``text`` is lower-cased before filtering.

    Returns
    -------
    str
        ``text`` with every character outside ``a-zA-Z`` turned into a space
        and the resulting whitespace runs collapsed (no leading/trailing space).
    """
    source = text.lower() if lower else text
    letters_only = re.sub("[^a-zA-Z]", " ", source)
    return " ".join(letters_only.split())

def remove_repeat(text, repeat=1):
    """Keep at most ``repeat`` occurrences of each space-separated token.

    Parameters
    ----------
    text : str
        Input text; it is split on the single-space character, so consecutive
        spaces yield empty-string tokens that are counted like any other token.
    repeat : int, default 1
        Maximum number of times a token may appear in the result.

    Returns
    -------
    str
        The retained tokens, in their original order, joined by single spaces.
    """
    kept = []
    # Per-token tally of what has been kept so far. This replaces a
    # list.count() scan on every iteration, which made the loop quadratic.
    kept_counts = {}
    for word in text.split(' '):
        seen = kept_counts.get(word, 0)
        if seen < repeat:
            kept.append(word)
            kept_counts[word] = seen + 1
    return ' '.join(kept)

In [3]:
# Build the training frame from two differently-sampled slices of the data:
#   * rows whose binarised target (`effectiveness`) is 0, down-sampled to 60 %
#   * rows whose binarised target is 1, sampled at 95 %
# NOTE: despite its name, `trainpos` holds the effectiveness == 0 rows
# (`zeroes` keeps only zeros); the name is kept for compatibility.

# Fixed seed so the sampled rows (and everything trained on them) are reproducible.
SAMPLE_SEED = 0

# Read and fill the CSV once; both slices start from the same filled frame.
train_raw = pd.read_csv('train_stemmed.csv').fillna(0)
effectiveness = train_raw['target'].apply(mappingfn2)

# `zeroes` maps every non-zero label to None, so dropna() keeps only the 0 rows.
trainpos = (
    train_raw.assign(effectiveness=effectiveness.apply(zeroes))
    .dropna()
    .sample(frac=0.6, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

# `excluder` maps every label other than 1 to None, so dropna() keeps only the 1 rows.
train = (
    train_raw.assign(effectiveness=effectiveness.apply(excluder))
    .dropna()
    .sample(frac=0.95, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

# ignore_index avoids duplicate index labels in the combined frame.
train = pd.concat([trainpos, train], ignore_index=True)

379518

In [5]:
# Create stopword list: the 150 most frequent unigrams of the training comments
# plus NLTK's English stopwords.

cv = CountVectorizer(ngram_range = (1, 1))
# map(str) guards against non-string cells (fillna(0) puts the int 0 into
# empty comments), which would make ' '.join raise TypeError.
documents = train.comment_text.map(str).tolist()
# Treat the whole corpus as a single document so the count matrix has one row
# holding the corpus-wide frequency of every token.
documents = [' '.join(documents)]

X = cv.fit_transform(documents).toarray()
freqs = X.flatten()
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# releases that predate get_feature_names_out().
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

df_word = pd.DataFrame({'word': words, 'freq': freqs})
df_word = df_word.sort_values(by='freq', ascending=False).reset_index(drop=True)

stopwords_list = df_word.word.tolist()[:150]

# Imported under an alias so the name `stopwords` only ever refers to the final
# set that remove_stopwords() reads, never to the NLTK corpus reader.
from nltk.corpus import stopwords as nltk_stopwords
stopwords_list += nltk_stopwords.words('english')
stopwords = set(stopwords_list)

In [8]:
# Build test set: the rows of the CSV whose id does not appear in `train`.

# Keys are the ids present in `train`; the values are unused placeholders.
test_dic = dict.fromkeys(train.id, 0)

def exclude_test_set(x):
    """Return ``x`` unless it is a key of ``test_dic``, in which case return None."""
    return None if x in test_dic else x

train['preprocess_text'] = train['preprocess_text'].apply(str)

train2 = pd.read_csv('train_stemmed.csv')
# Ids found in `train` become None; dropna() then removes those rows, along
# with any row that has a missing value in another column.
train2 = train2.assign(id=train2['id'].apply(exclude_test_set)).dropna()
train2 = train2.assign(preprocess_text=train2['preprocess_text'].apply(str))
preprocess_text2 = train2.preprocess_text

In [11]:
# Fit the bag-of-words vectorizer on the training text and split the data.
# The validation slice is deliberately tiny: the models have been cross-validated on a 4-fold split beforehand.

preprocess_text = train['preprocess_text']
y = train['target']

# Unigrams and bigrams, vocabulary capped at 10000 features.
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10000)
X = vectorizer.fit_transform(preprocess_text)

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.0001,
    random_state=0,
)

In [12]:
# Re-encode the training text with the already-fitted vectorizer. `X` is
# reassigned again by the evaluation cells further down.
X = vectorizer.transform(preprocess_text)

In [13]:
import pickle

# Persist the fitted vectorizer. The `with` block closes the file on exit, so
# no explicit close() is needed.
with open('vectorizer_final.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [14]:
# Logistic regression on `target` binarised at `threshold`: fit on the training
# split, then report on the rows behind `preprocess_text2` / `train2`.
threshold = 0.3
label = 'target >= {}'.format(threshold)

print(' ')
# The placeholder used to be printed verbatim because format() was never called.
print('----prediction of {} column----'.format(label))
print(' ')
y = np.where(y_train >= threshold, 1, 0)
model = LogisticRegression()
model.fit(X_train, y)
# Predictions for the validation slice; they are not scored in this cell.
y_pred = model.predict(X_valid)
X = vectorizer.transform(preprocess_text2)
y = np.where(train2['target'] >= threshold, 1, 0)
y_pred1 = model.predict(X)
print(classification_report(y, y_pred1))

 
----prediction of {} column----
 
              precision    recall  f1-score   support

           0       0.98      0.96      0.97    258550
           1       0.46      0.63      0.53     13304

    accuracy                           0.95    271854
   macro avg       0.72      0.80      0.75    271854
weighted avg       0.96      0.95      0.95    271854



In [15]:
# Persist the model fitted above. The `with` block closes the file on exit, so
# no explicit close() is needed.
with open('logisticRegression03.pkl', 'wb') as f:
    pickle.dump(model, f)

In [16]:
# Logistic regression on `target` binarised at `threshold`: fit on the training
# split, then report on the rows behind `preprocess_text2` / `train2`.
threshold = 0.4
label = 'target >= {}'.format(threshold)

print(' ')
# The placeholder used to be printed verbatim because format() was never called.
print('----prediction of {} column----'.format(label))
print(' ')
y = np.where(y_train >= threshold, 1, 0)
model = LogisticRegression()
model.fit(X_train, y)
# Predictions for the validation slice; they are not scored in this cell.
y_pred = model.predict(X_valid)
X = vectorizer.transform(preprocess_text2)
y = np.where(train2['target'] >= threshold, 1, 0)
y_pred1 = model.predict(X)
print(classification_report(y, y_pred1))

 
----prediction of {} column----
 
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    261652
           1       0.56      0.57      0.57     10202

    accuracy                           0.97    271854
   macro avg       0.77      0.78      0.77    271854
weighted avg       0.97      0.97      0.97    271854



In [17]:
# Persist the model fitted above. The `with` block closes the file on exit, so
# no explicit close() is needed.
with open('logisticRegression04.pkl', 'wb') as f:
    pickle.dump(model, f)

In [18]:
# Logistic regression on `target` binarised at `threshold`: fit on the training
# split, then report on the rows behind `preprocess_text2` / `train2`.
threshold = 0.5
label = 'target >= {}'.format(threshold)

print(' ')
# The placeholder used to be printed verbatim because format() was never called.
print('----prediction of {} column----'.format(label))
print(' ')
y = np.where(y_train >= threshold, 1, 0)
model = LogisticRegression()
model.fit(X_train, y)
# Predictions for the validation slice; they are not scored in this cell.
y_pred = model.predict(X_valid)
X = vectorizer.transform(preprocess_text2)
y = np.where(train2['target'] >= threshold, 1, 0)
y_pred1 = model.predict(X)
print(classification_report(y, y_pred1))

 
----prediction of {} column----
 
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    264457
           1       0.62      0.51      0.56      7397

    accuracy                           0.98    271854
   macro avg       0.80      0.75      0.78    271854
weighted avg       0.98      0.98      0.98    271854



In [19]:
# Persist the model fitted above. The `with` block closes the file on exit, so
# no explicit close() is needed.
with open('logisticRegression05.pkl', 'wb') as f:
    pickle.dump(model, f)

In [20]:
# Multinomial naive Bayes with `target` binarised at 0.3: fit on the training
# split, then report on the rows behind `preprocess_text2` / `train2`.
y = np.where(y_train >= 0.3, 1, 0)
model = MultinomialNB().fit(X_train, y)
# Predictions for the validation slice; they are not scored in this cell.
y_pred = model.predict(X_valid)

X = vectorizer.transform(preprocess_text2)
y = np.where(train2.target >= 0.3, 1, 0)
y_pred3 = model.predict(X)
print(classification_report(y, y_pred3))

              precision    recall  f1-score   support

           0       0.99      0.85      0.91    258550
           1       0.21      0.75      0.33     13304

    accuracy                           0.85    271854
   macro avg       0.60      0.80      0.62    271854
weighted avg       0.95      0.85      0.89    271854



In [21]:
# Persist the model fitted above. The `with` block closes the file on exit, so
# no explicit close() is needed.
with open('multinomialNB03.pkl', 'wb') as f:
    pickle.dump(model, f)

In [22]:
# Linear model trained with SGD on `target` binarised at `threshold`: fit on the
# training split, then report on the rows behind `preprocess_text2` / `train2`.
threshold = 0.5

y = np.where(y_train >= threshold, 1, 0)
# SGD shuffles the training data, so a fixed random_state is needed for the
# fitted model (and the report below) to be reproducible across runs.
model = SGDClassifier(random_state=0)
model.fit(X_train, y)
# Predictions for the validation slice; they are not scored in this cell.
y_pred = model.predict(X_valid)
X = vectorizer.transform(preprocess_text2)
y = np.where(train2['target'] >= threshold, 1, 0)
y_pred6 = model.predict(X)
print(classification_report(y, y_pred6))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    264457
           1       0.63      0.48      0.54      7397

    accuracy                           0.98    271854
   macro avg       0.81      0.73      0.76    271854
weighted avg       0.98      0.98      0.98    271854



In [23]:
# Persist the model fitted above. The `with` block closes the file on exit, so
# no explicit close() is needed.
with open('SGD05.pkl', 'wb') as f:
    pickle.dump(model, f)