In [2]:
pip install googletrans==3.1.0a0 --upgrade --quiet

[K     |████████████████████████████████| 61kB 4.2MB/s 
[K     |████████████████████████████████| 1.2MB 17.0MB/s 
[K     |████████████████████████████████| 51kB 6.6MB/s 
[K     |████████████████████████████████| 71kB 7.9MB/s 
[K     |████████████████████████████████| 61kB 7.5MB/s 
[?25h  Building wheel for googletrans (setup.py) ... [?25l[?25hdone


In [3]:
pip install catboost --upgrade --quiet

[K     |████████████████████████████████| 69.2MB 41kB/s 
[?25h

In [4]:
# importing the important library
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk 
import re
import seaborn as sns
from googletrans import Translator, constants
from pprint import pprint
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('wordnet')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from scipy.sparse import hstack
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.utils import resample



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
project_data = pd.read_csv('HOT_Dataset_modified.csv', encoding='utf-8', header=None)
project_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.0,@saud5683 @Mutayyab420 @shivang598 @Ranask35 @...,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,
2,2.0,"Banti hai empowered woman, feminism pe gyan pe...",,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,
4,2.0,RT @kim_jong_korea: @updatingwait @Acutereply ...,,,,,,,,,,,,,,,,,


In [6]:
project_data = project_data.dropna(how='all')

In [7]:
project_data = project_data.iloc[0::, 0:2]

In [8]:
project_data[0].value_counts()

2.0    1765
0.0    1121
1.0     303
Name: 0, dtype: int64

In [9]:
project_data.columns = ['label', 'tweet']

In [10]:
project_data.isnull().any()

label    False
tweet    False
dtype: bool

## Function for Data Cleaning

In [11]:
def userid(tweet):
    ''' This function calculates the number of userids in the tweets'''
    count = 0
    for i in tweet.split():
        if i[0] == '@':
            count += 1
    return count

def profanity_vector(tweet):
    
    ''' This functions calculates the profanity vector for a given tweet '''
    
    bad_words = pd.read_csv('Hinglish_Profanity_List.csv', engine='python', header=None)
    bad_words.columns = ['Hinglish', 'English', 'Level']
    hinglish = bad_words['Hinglish'].values
    level = bad_words['Level'].values
    PV = [0] * len(level)
    for word in tweet.split():
        if word in hinglish:
            idx = np.where(hinglish == word)
            PV[level[idx][0]] = 1
    return PV

def translation(tweet):
    
    ''' This function translates the hinglish tweet into english '''
    translator = Translator()
    trans = translator.translate(tweet)
    trans_tweet = trans.text
    
    return trans_tweet.lower()

def stopword(data):
    
    ''' This function removes the stopwords from the given sentence'''
    clean = []
    stop_words = set(STOPWORDS)
    
    for tweet in data:
        sentence = []
        for word in tweet.split():
            if word not in stop_words:
                sentence.append(word)
        clean.append(sentence)
    return clean

def Lemmatizer(tweet):
    
    ''' This function uses NLTK lemmatization method and clean the sentence'''
    lemma = []
    lemmatizer = WordNetLemmatizer()
    
    for word in tweet:
        sentence = []
        for i in word:
             sentence.append(lemmatizer.lemmatize(i))
        lemma.append(' '.join(sentence))
    return lemma

def SID(tweet):
    
    ''' This function calculates the NLTK sentiments and return the negative, neutral, postive and compound values'''
    negative = []
    neutral = []
    positive = []
    compound = []
    
    sid = SentimentIntensityAnalyzer()
    sentiment_score = sid.polarity_scores(tweet)
    
    negative.append(sentiment_score['neg'])
    neutral.append(sentiment_score['neu'])
    positive.append(sentiment_score['pos'])
    compound.append(sentiment_score['compound'])
    
    return negative, neutral, positive, compound

def imp_features(data, y, keep):

    rf = RandomForestClassifier(n_estimators = 100, n_jobs = -1)
    rf.fit(data, y)
    imp_feature = np.argsort(rf.feature_importances_)[::-1]

    return imp_feature[:keep]


def cleaning(data):
    
    ''' This functions clean the input text'''
    
    user_ids = []
    clean_data_hinglish = []
    clean_translated_data = []
    prof_vector = []
    
    for tweet in tqdm(data):
        userids = userid(tweet)
        clean_text = []
        tweet = re.sub(r'\\n', ' ', tweet)  # replacing '\\n' with a space
        tweet = re.sub(r',', ' ', tweet)    # replacing ','  with a space
        tweet = re.sub(r'RT|rt', '', tweet)
        
        for word in tweet.split():
            if word[0] == '@':              # removing user_ids 
                clean_word = re.sub(word, 'username', word)
            else:
                clean_word = word.lower()       # lowercase all the words
                clean_word = re.sub(r'^#\w+', ' ', clean_word)
                #clean_word = re.sub(r'^\\[a-z0-9].*\\[a-z0-9{3}+]*[^\\n]$', '', clean_word)   # removing emotions in unicode
                clean_word = re.sub(r'\\', ' ', clean_word)
                clean_word = re.sub(r'^https:[\a-zA-Z0-9]+', '', clean_word)              # replacing url link with 'url'
                #clean_word = re.sub(r'[^a-z].\w+', '', clean_word)           # removing evering thing except a-z
                clean_word = re.sub(r'[!,.:_;$%^\'\#"&]', '', clean_word)
                clean_text.append(clean_word)
                
        clean_text = (' ').join(clean_text)
    
        PV = profanity_vector(clean_text)  # calling profanity_vector function
        translated_tweet = translation(clean_text)  #calling translated_tweet function
        
        user_ids.append(userids)
        clean_data_hinglish.append(clean_text)
        clean_translated_data.append(translated_tweet)
        prof_vector.append(PV)
        
        
    clean_data_hinglish = np.asarray(clean_data_hinglish)
    user_ids = np.asarray(user_ids).reshape(-1,1)
    prof_vector = np.asarray(prof_vector)
    clean_translated_data = np.asarray(clean_translated_data)

        
    return clean_data_hinglish, user_ids, prof_vector, clean_translated_data

### Function for feature Engineering

In [12]:
def feature_process(clean_data_train, clean_data_test, userids_train, userids_test, PV_train, PV_test):
    ''' This function except the clean data and return Train and Test dataset after stacking userids, profanity vector, negative sentiment, neutral sentiment, 
                    positive sentiment, compound sentiment, n-grams and tfidf features'''
    
    vectorizer = CountVectorizer()
    tfidf = TfidfVectorizer()
    scaler = MinMaxScaler()
    negative_train, negative_test = [], []
    neutral_train, neutral_test = [], []
    positive_train, positive_test  = [], []
    compound_train, compound_test  = [], []

    for tweet in clean_data_train:
        neg, neu, pos, comp = SID(tweet)
        negative_train.append(neg), neutral_train.append(neu), positive_train.append(pos), compound_train.append(comp)
    
    for tweet in clean_data_test:
        neg, neu, pos, comp = SID(tweet)
        negative_test.append(neg), neutral_test.append(neu), positive_test.append(pos), compound_test.append(comp)
    
    clean_data_SW_train = stopword(clean_data_train)
    clean_data_SW_test = stopword(clean_data_test)
    
    clean_data_lemm_train = Lemmatizer(clean_data_SW_train)
    clean_data_lemm_test = Lemmatizer(clean_data_SW_test)
    
    vectorizer.fit(clean_data_lemm_train)
    tfidf.fit(clean_data_lemm_train)
    
    n_grams_train = vectorizer.transform(clean_data_lemm_train)
    tfidf_ngrams_train = tfidf.transform(clean_data_lemm_train)
    
    n_grams_test = vectorizer.transform(clean_data_lemm_test)
    tfidf_ngrams_test = tfidf.transform(clean_data_lemm_test)
    
    negative_train, negative_test = np.asarray(negative_train), np.asarray(negative_test)
    neutral_train, neutral_test = np.asarray(neutral_train), np.asarray(neutral_test)
    positive_train, positive_test  = np.asarray(positive_train), np.asarray(positive_test)
    compound_train, compound_test = np.asarray(compound_train), np.asarray(compound_test)
    userids_train = scaler.fit_transform(userids_train)
    userids_test = scaler.transform(userids_test)
    
    train_dataset = hstack((userids_train, PV_train, negative_train, neutral_train, positive_train, compound_train, n_grams_train, tfidf_ngrams_train))
    
    test_dataset = hstack((userids_test, PV_test, negative_test, neutral_test, positive_test, compound_test, n_grams_test, tfidf_ngrams_test))
    
    
    return train_dataset, test_dataset

## Spliting Train and Test Dataset into 80:20


In [13]:
X = project_data['tweet']
y = project_data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42) 

In [14]:
# calling the cleaning function which returns the values for the train dataset.

clean_data_hinglish_train, user_ids_train, prof_vector_train, clean_translated_data_train = cleaning(X_train)

100%|██████████| 2551/2551 [15:55<00:00,  2.67it/s]


In [15]:
# calling the cleaning function which returns the values for the test dataset

clean_data_hinglish_test, user_ids_test, prof_vector_test, clean_translated_data_test = cleaning(X_test)

100%|██████████| 638/638 [04:02<00:00,  2.63it/s]


In [16]:
#Calling feature_process which return the complete train and test dataset.

Train, Test = feature_process(clean_translated_data_train, clean_translated_data_test, user_ids_train, user_ids_test, prof_vector_train, prof_vector_test)

In [17]:
print('{} is the shape of Train Dataset and {} is the shape of Test Dataset'.format(Train.shape, Test.shape))

(2551, 14022) is the shape of Train Dataset and (638, 14022) is the shape of Test Dataset


In [None]:
def plot_confusion_matrix(test_y, predict_y):
    
    '''This function returns confusion matrix, precison matrix and recall matrix for 3 class classification'''
    
    C = confusion_matrix(test_y, predict_y)
    print("Number of misclassified points ",(len(test_y)-np.trace(C))/len(test_y)*100)
    
    A =(((C.T)/(C.sum(axis=1))).T)
    B =(C/C.sum(axis=0))
    
    labels = ['Non Offensive', 'Hate Speech', 'Abusive']
    cmap=sns.light_palette("green")
    
    # representing A in heatmap format
    print("-"*50, "Confusion matrix", "-"*50)
    plt.figure(figsize=(10,5))
    sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    
    print("-"*50, "Precision matrix", "-"*50)
    plt.figure(figsize=(10,5))
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("Sum of columns in precision matrix",B.sum(axis=0))
    
    # representing B in heatmap format
    print("-"*50, "Recall matrix" , "-"*50)
    plt.figure(figsize=(10,5))
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("Sum of rows in Recall matrix",A.sum(axis=1))

## Custom Stacking Classifier:

##### a. Splitting the train data into 2 parts

In [18]:
# splitting the Actuall Train Dataset into 50:50

Train_D1, Train_D2, y_train_D1, y_train_D2 = train_test_split(Train, y_train, test_size=0.5, stratify=y_train, random_state=42)

print('{} is the shape of Train Data D1 and {} is the shape of Train data D2.'.format(Train_D1.shape, Train_D2.shape))

(1275, 14022) is the shape of Train Data D1 and (1276, 14022) is the shape of Train data D2.


##### b. Intializing different type of classifiers

In [30]:
# Initialing Support Vector classifier with Linear Kernel
SVC_Linear = SVC(C=100, gamma="auto", kernel = "linear", probability=True, random_state=42)

# Initializing Support Vector classifier with Poly kernel
SVC_Poly = SVC(C = 75, degree = 2, gamma = "auto", probability=True, kernel = "poly", random_state=42)

# Initializing Support Vector classifier with RBF kernel
SVC_RBF = SVC(C = 100, degree = 1, gamma = "auto", probability=True, kernel = "rbf", random_state=42)

# Intitializing Logistic Regression Classifier
Logistic_Regression = LogisticRegression(C=1, max_iter=500, penalty='l2', random_state=42, n_jobs = -1)

# Intitializing KNN Classifier
KNN = KNeighborsClassifier(n_neighbors=3) 

# Intitializing Decision Tree Classifier
Decision_Tree = DecisionTreeClassifier(criterion='gini', max_depth=18, min_samples_split=2)

# Intitializing Extra_Tree Classifier
Extra_Tree = ExtraTreesClassifier(criterion='gini', max_depth=200, n_estimators=500, n_jobs=-1)

# Initializing Random Forest classifier
Random_Forest = RandomForestClassifier(n_estimators = 500, criterion = "gini", max_depth = 10, max_features = "auto", n_jobs = -1, random_state = 42)

# Initializing XGBOOST  classifier
XGB = XGBClassifier(depth=10, learning_rate =0.7, iterations=200, n_jobs=-1,random_state=42)

# Initializing CatBoost classifier
CatBoost = CatBoostClassifier(depth= 10, learning_rate =0.6, iterations=150, random_state=42, verbose=0)

# Ini-tializing Light GBM classifier
Light_GBM = LGBMClassifier(max_depth= 10, min_data_in_leaf = 2, num_leaves=50, n_jobs=-1, random_state=42)

classifiers_dict = {"SVC-RBF": SVC_RBF,
               "XGB": XGB,
               "SVC_Linear": SVC_Linear,
               "CatBoost": CatBoost,
               "Light_GBM": Light_GBM,
               "Random_Forest": Random_Forest,
               "Logistic_Regression": Logistic_Regression,}

base_models_list = [SVC_Linear, SVC_RBF, SVC_Poly, XGB, CatBoost, KNN, Decision_Tree, Extra_Tree, Random_Forest, Light_GBM, Logistic_Regression,]

##### c. Function for the custom stacking classifier

In [31]:
def stacking_classifier(k_model, base_models, meta_model, D1, D2, y_D1, y_D2, test, y_te):
  """This function performs custom stacking classification as it takes no. of base models, then the output of the base models will the added to the another dataset and that dataset used to train the meta classifier""" 
  base_model_fits = []
  predictions = pd.DataFrame()
  predictions_test = pd.DataFrame()
  for i in range(k_model):
      base_model = base_models[i]

      # Get a random sample with replacement with a size of 1000 from D1 
      train_sample, y_sample = resample(D1, y_D1, n_samples=1000, stratify=y_D1, random_state=42)

      base_model.fit(train_sample, y_sample) # train the model on sample
      base_model_fits.append(base_model) # save the base model

  for j in range(k_model): # send D2 to all base models
      y_pred = base_model_fits[j].predict_proba(D2) #predict  the probablities of the classes for D2 set
      for c in range(len(y_pred[0])):
          predictions[f"{j}{c}"] = y_pred[0:,c] # store the probablities of each class 

  for m in range(k_model): # send test data to all base models
      y_pred_test = base_model_fits[m].predict_proba(test) #predict  the probablities of the classes for test set
      for p in range(len(y_pred_test[0])):
          predictions_test[f"{m}{p}"] = y_pred_test[0:,p]  # store the probablities of each class
        
  meta_data_train = hstack((D2, predictions)) # stacking the D2 data and prediction probablities from k base model for meta classifier
  meta_data_test = hstack((test, predictions_test)) # stacking the train data and k prediction for test set

  
  meta_model.fit(meta_data_train, y_D2) # train the meta model on meta data
  print("F1-Score for {} as Meta classifier and {} number of base model is {}".format('Logistic_Regression', K+1, np.round(f1_score(y_te, meta_model.predict(meta_data_test), average='macro'), 6)))
  #plot_confusion_matrix(y_te, meta_classifier.predict(Test_new))

        
        
for K in range(len(base_models_list)):
  stacking_classifier(K+1, base_models_list, Logistic_Regression, Train_D1, Train_D2, y_train_D1, y_train_D2, Test, y_test)


F1-Score for Logistic_Regression as Meta classifier and 1 number of base model is 0.810756
F1-Score for Logistic_Regression as Meta classifier and 2 number of base model is 0.80566
F1-Score for Logistic_Regression as Meta classifier and 3 number of base model is 0.80566
F1-Score for Logistic_Regression as Meta classifier and 4 number of base model is 0.821916
F1-Score for Logistic_Regression as Meta classifier and 5 number of base model is 0.823512
F1-Score for Logistic_Regression as Meta classifier and 6 number of base model is 0.826029
F1-Score for Logistic_Regression as Meta classifier and 7 number of base model is 0.815016
F1-Score for Logistic_Regression as Meta classifier and 8 number of base model is 0.825035
F1-Score for Logistic_Regression as Meta classifier and 9 number of base model is 0.825035




F1-Score for Logistic_Regression as Meta classifier and 10 number of base model is 0.81249




F1-Score for Logistic_Regression as Meta classifier and 11 number of base model is 0.817237




> From above experiments we can conclude that number of base is 6 gives the highest F1-Score. So we can move ahead with 6 number of base model in our custom stacking classifier.


