# Import all the necessary python libraries

In [None]:
import numpy as np
import pandas as pd
import regex as re
import pickle
import nltk
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.linear_model import Ridge
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# Start the code execution timer, Read the train and test .csv files 

In [2]:
# Wall-clock start time; used at the very end to report the total run time.
Start = time.time()

# Load the training and test sets (in that order). Missing values in either
# file are replaced with a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('train.csv', 'test.csv')
)

# PRE-PROCESSING

## Lemmatization of Training & Test Comments

In [3]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# NLTK's English stop-word list. In the visible part of this file it is only
# referenced by the commented-out stop-word filter below.
stopWords = stopwords.words('english')
# Tokenizer that splits text on whitespace.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# WordNet lemmatizer applied to every token by lemmatize_text().
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Return the lemmatized whitespace tokens of ``text`` as a list.

    ``text`` is split with the module-level ``w_tokenizer`` and each resulting
    token is passed through the module-level ``lemmatizer``.
    """
    tokens = w_tokenizer.tokenize(text)
    return list(map(lemmatizer.lemmatize, tokens))

# Lemmatize every comment, then join the lemmas back together with single
# spaces so that each row of the "text" column is a plain sentence again.
train["text"] = train["text"].apply(lemmatize_text).apply(' '.join)
test["text"] = test["text"].apply(lemmatize_text).apply(' '.join)

# train_text = train_text.apply(lambda x: ' '.join(([word for word in x if word not in (stopWords)])))  
# Instead of removing the stop words here, we take care of them during tokenization, vectorization of words and sentences.

## Manual Text Clean-up & Pre-processing

In [4]:
# Containers that will hold the cleaned-up training and test comments.
temporary_training_data, temporary_testing_data = [], []

# Plain Python lists of the comments, one entry per row of the "text" column.
train_comments_list, test_comments_list = (
    frame["text"].tolist() for frame in (train, test)
)

# Token -> replacement table used by the manual clean-up step.
#
# The original literal listed several keys twice (":-(", ":(", ":s", ":-s",
# ":/", ":&gt;", ":')" and "m"). In a dict literal the LAST occurrence of a key
# wins, so the earlier " sad " / " bad " values were silently discarded. Only
# the effective (last) value of each key is kept here; the resulting mapping is
# identical to the one the original literal produced.
#
# NOTE(review): every key is lowercase while the clean-up step compares tokens
# case-sensitively, so e.g. ":D" or "Don't" are not replaced.
sensibleWords = {
    # Negative emoticons ("&gt;" / "&lt;" are HTML-escaped ">" / "<").
    ":')": " sad ",
    ":-(": " frown ",
    ":(": " frown ",
    ":s": " frown ",
    ":-s": " frown ",
    ":/": " worry ",
    ":&gt;": " angry ",
    "&lt;3": " heart ",
    # Cheering.
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    # Chat abbreviations and laughter.
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    # Contractions.
    "don't": "do not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "i'll": "i will",
    # NOTE(review): "its" is usually the possessive, not "it is" -- confirm.
    "its": "it is",
    "it's": "it is",
    "'s": " is",
    "that's": "that is",
    "weren't": "were not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    # Positive emoticons.
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    # Regex-style keys. The clean-up step looks tokens up by exact equality, so
    # these only match a token that is literally e.g. `\bu\b`. They are kept
    # unchanged to preserve the mapping.
    # NOTE(review): presumably left over from a re.sub-based version -- confirm
    # and remove.
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
}

# All tokens that have a replacement in `sensibleWords`. Kept as a list for
# backward compatibility; the clean-up below looks tokens up in the dict
# directly, which is O(1) instead of a linear scan per token.
nonsenseWords = list(sensibleWords)

# Matches every run of characters other than ASCII letters, spaces, '?' and
# '!'. Compiled once because it is applied to every comment.
_DISALLOWED_CHARS = re.compile('[^a-zA-Z ?!]+')


def _clean_comment(comment, replacements):
    """Return the cleaned-up version of a single comment.

    Steps, in order:
      1. Split the comment on whitespace.
      2. Drop every token that starts with 'http' or 'www' (links).
      3. Replace a token by ``replacements[token]`` when it is a key of
         ``replacements`` (exact, case-sensitive match -- tokens are
         deliberately not lower-cased because case is treated as meaningful).
      4. Concatenate the tokens, each followed by one space.
      5. Remove numbers and punctuation other than '?' and '!'.

    Args:
        comment: The raw comment; converted with ``str()`` before splitting.
        replacements: Mapping from token to replacement text.

    Returns:
        The cleaned sentence. Every kept token is followed by a single space,
        so a non-empty result ends with a space; an empty comment gives "".
    """
    kept = [
        replacements.get(word, word)
        for word in str(comment).split()
        if not word.startswith(('http', 'www'))
    ]
    sentence = "".join(token + " " for token in kept)
    return _DISALLOWED_CHARS.sub('', sentence)


temporary_training_data = [
    _clean_comment(comment, sensibleWords) for comment in train_comments_list
]
temporary_testing_data = [
    _clean_comment(comment, sensibleWords) for comment in test_comments_list
]

train["text"] = temporary_training_data
test["text"] = temporary_testing_data
del temporary_training_data, temporary_testing_data

In [5]:
train_text, test_text = train["text"], test["text"]
# The pre-processed training and test comments are stacked into one Series so
# that the TF-IDF vectorizers are fitted on the vocabulary of both sets.
complete_text = pd.concat([train_text, test_text])

# EMBEDDING & VECTORIZATION

## Word Embedding - Vectorization of Words using TF-IDF (Analyser = 'word')

In [6]:
# Word-level TF-IDF over unigrams and bigrams, fitted on train + test text.
word_vectorizer = TfidfVectorizer(
    analyzer='word',          # features are built from word tokens
    token_pattern='(?u)\\b\\w\\w+\\b\\w{,1}',
    ngram_range=(1, 2),       # single words and pairs of consecutive words
    lowercase=False,          # case is kept because it is treated as meaningful
    # NOTE(review): with lowercase=False the built-in English stop list
    # presumably only removes lowercase tokens ("the" but not "The") -- confirm.
    stop_words='english',
    strip_accents='unicode',  # accent removal that works on any character
                              # ('ascii' is faster but only covers characters
                              # with a direct ASCII mapping)
    sublinear_tf=True,        # use 1 + log(tf) instead of the raw term count,
                              # since 20 occurrences are unlikely to be 20x as
                              # significant as one
    min_df=2,                 # ignore terms found in fewer than 2 documents
    max_df=0.5,               # ignore terms found in more than half of them
    max_features=30000,
    norm='l2',
).fit(complete_text)

train_word_features, test_word_features = (
    word_vectorizer.transform(texts) for texts in (train_text, test_text)
)

In [7]:
# word_vectorizer = TfidfVectorizer(
#     sublinear_tf = True,
#     strip_accents = 'unicode',
#     tokenizer = lambda x: regex.findall(r'[^\p{P}\W]+', x),
#     analyzer = 'word',
#     min_df = 5,
#     norm = 'l2',
#     lowercase = True,
#     ngram_range = (1, 1),
#     max_features = 60000
# )
# word_vectorizer.fit(complete_text)
# train_word_features = word_vectorizer.transform(train_text)
# test_word_features = word_vectorizer.transform(test_text)

In [8]:
# word_vectorizer = TfidfVectorizer(
#     sublinear_tf = True,
#     strip_accents = 'unicode', # Remove accents and perform other character normalization during the preprocessing step. 
                                 # ‘ascii’ is a fast method that only works on characters that have an direct ASCII mapping. 
                                 # ‘unicode’ is a slightly slower method that works on any characters.
#     tokenizer = None,
#     analyzer = 'word',
#     token_pattern = '(?u)\\b\\w\\w+\\b\\w{,1}',
#     min_df = 5,
#     norm = 'l2',
#     lowercase = True,
#     ngram_range = (1, 1),
#     max_features = 60000
# )
# word_vectorizer.fit(complete_text)
# train_word_features = word_vectorizer.transform(train_text)
# test_word_features = word_vectorizer.transform(test_text)

## Character Embedding - Vectorization of Individual Characters using TF-IDF (Analyser = 'char')

In [9]:
# Character-level TF-IDF, fitted on train + test text like the word model.
char_vectorizer = TfidfVectorizer(
    analyzer='char',          # features are built from individual characters
    ngram_range=(2, 6),       # runs of 2 to 6 consecutive characters
    strip_accents='unicode',  # accent removal that works on any character
                              # ('ascii' is faster but only covers characters
                              # with a direct ASCII mapping)
    sublinear_tf=True,
    min_df=2,
    max_df=0.5,
    max_features=20000,
).fit(complete_text)

train_char_features, test_char_features = (
    char_vectorizer.transform(texts) for texts in (train_text, test_text)
)

In [None]:
# char_vectorizer = TfidfVectorizer (
#     sublinear_tf = True,
#     strip_accents = 'unicode',
#     analyzer = 'char',
#     ngram_range = (2, 4), 
#     min_df = 5, 
#     max_df = 0.5,
#     max_features = 25000
# )
# char_vectorizer.fit(complete_text)
# train_char_features = char_vectorizer.transform(train_text)
# test_char_features = char_vectorizer.transform(test_text)

In [None]:
# char_vectorizer = TfidfVectorizer (
#     sublinear_tf = True,
#     strip_accents = 'unicode',
#     analyzer = 'char',
#     ngram_range = (2, 6), 
#     min_df = 2, 
#     max_df = 0.5,
#     max_features = 60000
# )
# char_vectorizer.fit(complete_text)
# train_char_features = char_vectorizer.transform(train_text)
# test_char_features = char_vectorizer.transform(test_text)

## Merge the character and word vectorization results horizontally together to get the actual training and test features

In [10]:
# Column-wise concatenation of the character and word TF-IDF matrices: for
# each data set the character features come first, then the word features.
train_features = hstack((train_char_features, train_word_features))
test_features = hstack((test_char_features, test_word_features))

# Categorisation of Test Data & Naive Bayes Feature Equation

In [11]:
# CSR copy of the training features; pr() selects rows from it with a mask.
temp = train_features.tocsr()

# Names of the label columns of `train`; one model is trained per column.
categories = [
    'harsh',
    'extremely_harsh',
    'vulgar',
    'threatening',
    'disrespect',
    'targeted_hate',
]

# Naive Bayes Feature Equation
def pr(y_i, y):
    """Return the add-one smoothed per-feature ratio for the rows labelled ``y_i``.

    Sums, column by column, the rows of the module-level ``temp`` matrix whose
    label in ``y`` equals ``y_i``, adds 1, and divides by the number of such
    rows plus 1.

    Args:
        y_i: The label value to select (the callers in this file pass 0 or 1).
        y: Array of labels compared element-wise against ``y_i``.
           Assumes one label per row of ``temp`` -- TODO confirm.
    """
    selected = y == y_i
    column_totals = temp[selected].sum(axis=0)
    return (column_totals + 1) / (selected.sum() + 1)



# CLASSIFICATION MODELS

## Pickle File: 
1. Once trained, a model can be stored in a pickle (.pckl) file.
2. The model can later be loaded from this file to make predictions without re-training.
3. This saves training time, making the code more efficient.
4. Pickling serializes the trained machine learning model so that the serialized form can be saved to disk.

## 1. Voting Classifier: Trains on 4 different classification models

In [None]:
cross_validation_scores = []
ratio_values = []  # One Naive Bayes log-count ratio per category; reused at prediction time.

# A Voting Classifier is a machine learning model that combines an ensemble of several models:
# 1. Logistic Regression.
# 2. Random Forest Classifier.
# 3. Easy Ensemble Classifier with logistic regression as the base estimator.
# 4. Easy Ensemble Classifier with SGD classifier as the base estimator.
# With voting = 'soft' the predicted class is the one with the highest weighted
# average of the probabilities predicted by the individual models.
#
# NOTE(review): `ratio_values` lives only in memory; the prediction cell needs it,
# so it must run in the same session as this cell.
# NOTE(review): `r` is computed from ALL training rows before cross_val_score
# splits them, so each validation fold has influenced its own feature scaling;
# the reported CV score may be optimistic.
# NOTE(review): newer imbalanced-learn releases appear to rename `base_estimator`
# to `estimator` -- confirm against the installed version.

# The pickle file that stores the trained models is opened with a context
# manager so that it is closed even if training fails part-way through.
with open('model_storage.pckl', 'wb') as model_storage:
    for category in categories:
        train_target = train[category]
        train_target_values = train_target.values

        # Naive Bayes log-count ratio; every feature column is scaled by it.
        r = np.log(pr(1, train_target_values) / pr(0, train_target_values))
        x_nb = train_features.multiply(r)

        rfc = RandomForestClassifier(
            max_features=999,
            max_depth=100,
            min_samples_split=10,
            criterion='gini',
            n_estimators=120,
            min_weight_fraction_leaf=0.0,
            max_leaf_nodes=None,
        )
        lr = LogisticRegression(max_iter=499, dual=False, C=2)
        # Per the original author's experiments, the sag solver was faster and
        # more accurate than liblinear and the default solver.
        eec_lr = EasyEnsembleClassifier(
            base_estimator=LogisticRegression(solver='sag', max_iter=450, C=2)
        )
        eec_sgdc = EasyEnsembleClassifier(
            base_estimator=SGDClassifier(
                max_iter=165, alpha=0.0002, penalty="l2", loss='modified_huber'
            )
        )
        vote = VotingClassifier(
            voting='soft',
            estimators=[('lr', eec_lr), ('sgd', eec_sgdc), ('lr1', lr), ('rdf', rfc)],
            weights=[0.9, 1.35, 0.65, 0.8],
        )

        vote.fit(x_nb, train_target_values)
        ratio_values.append(r)
        # Models are appended to the file one after another, in `categories` order.
        pickle.dump(vote, model_storage)

        cv_score = np.mean(cross_val_score(vote, x_nb, train_target, cv=5, scoring='roc_auc'))
        cross_validation_scores.append(cv_score)
        #print('Cross Validation score for class {} is {}'.format(category, cv_score))

print('Cross Validation score is {}'.format(np.mean(cross_validation_scores)))  # Mean over all categories.

## 2. Ridge Classification Model

In [13]:
# cross_validation_scores = []
# model_storage = open('model_storage.pckl', 'wb')

# for category in categories:
#     train_target = train[category]
#     #rigde = Ridge(alpha = 29, copy_X = True, fit_intercept = True, solver = 'sag', max_iter = 500, normalize = False, random_state = 0, tol = 0.0025)
#     ridgeClassifier = Ridge(solver = 'sag', max_iter = 150, fit_intercept = True, tol = 0.0025, alpha = 29, copy_X = True, random_state = 0)
#     cv_score = np.mean(cross_val_score(ridgeClassifier, train_features, train_target, cv = 3, scoring = 'roc_auc'))
#     cross_validation_scores.append(cv_score)
#     #print('Cross Validation score for class {} is {}'.format(category, cv_score))
#     ridgeClassifier.fit(train_features, train_target)
#     pickle.dump(ridgeClassifier, model_storage)

# model_storage.close()
# print('Cross Validation score is {}'.format(np.mean(cross_validation_scores)))

Cross Validation score is 0.9834960337914218


## 3. Random Forest Classification Model

In [None]:
# cross_validation_scores = []
# result = pd.DataFrame.from_dict({'id': test['id']})
# model_storage = open('model_storage.pckl', 'wb')

# for category in categories:
#     train_target = train[category]
#     rfc = RandomForestClassifier(max_features = 999, max_depth = 100, min_samples_split = 10, criterion = 'gini', n_estimators = 120, min_weight_fraction_leaf = 0.0, max_leaf_nodes = None)
#     cv_score = np.mean(cross_val_score(rfc, train_features, train_target, cv=3, scoring='roc_auc'))
#     cross_validation_scores.append(cv_score)
#     #print('CV score for class {} is {}'.format(category, cv_score))
#     rfc.fit(train_features, train_target)
#     pickle.dump(rfc, model_storage)

# model_storage.close()
# print('Cross Validation score is {}'.format(np.mean(cross_validation_scores)))

## 4. Logistic Regression CV along with Sampling

In [None]:
# from imblearn.under_sampling import NearMiss  # Under Sampling
# nm = NearMiss(random state = 100)

# # from imblearn.over_sampling import SMOTE  # Over Sampling
# # smk = SMOTE(random_state = 12)

# from sklearn.linear_model import LogisticRegressionCV
# lr = LogisticRegressionCV(solver = 'liblinear', n_jobs = -1) # liblinear solver proved to be the best

# for category in categories:
#     Y_train = train_df[category].to_numpy().astype(np.float64)
#     X_train_balanced, Y_train_balanced = nm.fit_resample(X_train, Y_train)
#     lr.fit(X_train_balanced, Y_train_balanced)
#     pickle.dump(lr, model_storage)

# model_storage.close()

# Predictions on the test data (Separately for all categories) & output .csv preparation 

In [None]:
Output = pd.DataFrame.from_dict({'id': test['id']})
models = []

# Read back every model that was dumped into the pickle file, in the order in
# which they were written (one per category).
# NOTE: unpickling can execute arbitrary code -- only load a file that was
# produced by this notebook.
with open("model_storage.pckl", "rb") as file:
    while True:
        try:
            models.append(pickle.load(file))
        except EOFError:  # Raised by pickle.load once the file is exhausted.
            break

# Fail early with a clear message instead of an IndexError in the middle of
# the prediction loop.
if len(models) < len(categories) or len(ratio_values) < len(categories):
    raise ValueError(
        'Expected {} models and ratio vectors but found {} models and {} ratio vectors; '
        're-run the training cell in this session.'.format(
            len(categories), len(models), len(ratio_values)
        )
    )

# All models except the Ridge model are queried with .predict_proba(test data);
# the Ridge model works with the plain .predict(test data) function, i.e.
#     Output[category] = model.predict(test_features)
# The test features are scaled with the same per-category ratio that was used
# for training, and column 1 of predict_proba is the positive-class probability.
for category, model, ratio in zip(categories, models, ratio_values):
    Output[category] = model.predict_proba(test_features.multiply(ratio))[:, 1]

Output.to_csv('FinalSubmission.csv', index=False)  # Final Kaggle submission .csv file.

End = time.time()
print('Time of execution: {} minutes'.format((End - Start) / 60))  # Print the code execution time.

Time of execution: 371.9367505431175 minutes
