In [181]:
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import preprocess_string
import gensim.parsing.preprocessing as gsp
from sklearn import utils as skutils
from gensim import utils
from tqdm import tqdm
import multiprocessing
import nltk
import re
from collections import Counter, defaultdict
from nltk.corpus import stopwords


In [182]:
Train_csv = "D:\\Courses\\Sem 8 2021-22\\COL865\\Project Dataset\\CodaLab\\Constraint_English_Train - Sheet1.csv"
Test_csv = "D:\\Courses\\Sem 8 2021-22\\COL865\\Project Dataset\\CodaLab\\english_test_with_labels - Sheet1.csv"
Val_csv = "D:\\Courses\\Sem 8 2021-22\\COL865\\Project Dataset\\CodaLab\\Constraint_English_Val - Sheet1.csv"

In [183]:
df = pd.read_csv(Train_csv)

In [184]:
def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

In [185]:
def clean_text(
    string: str, 
    punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
    stop_words=[]):
    

    string = re.sub(r'https?://\S+|www\.\S+', '', string)


    string = re.sub(r'<.*?>', '', string)


    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, " ") 

    string = string.lower()


    string = ' '.join([word for word in string.split() if word not in stop_words])

    string = re.sub(r'\s+', ' ', string).strip()

    return string   

In [186]:
filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

In [187]:
def normalize(X):
    mean = np.mean(X)
    stddev = np.std(X)
    return (X - mean) * (1 / stddev)

DOC2VEC Transformer

In [188]:
class Doc2VecTransformer(BaseEstimator):

    def __init__(self, vector_size=100, learning_rate=0.02, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() - 1

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(clean_text(remove_emojis(row)).split(), [index]) for index, row in enumerate(df_x)]
        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers)

        for epoch in range(self.epochs):
            model.train(skutils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha

        self._model = model
        return self

    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(clean_text(row).split())
                                     for index, row in enumerate(df_x)]))

In [190]:
# way to use doc2vec transformer

# can i train it on a a  larger corpus ?
# look into it.
df_x = df['tweet']
doc2vec = Doc2VecTransformer(vector_size = 2000)
doc2vec_model =  doc2vec.fit(df_x)

100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 2141176.18it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 2143050.67it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 2136589.04it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 2137097.75it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 3216367.85it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 2170691.79it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 2117602.37it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 2142539.12it/s]
100%|███████████████████████████████████

In [191]:
#for TF-IDF 
#unigram
vectorizer = TfidfVectorizer(ngram_range = (1,1))

# # bigram
# vectorizer = TfidfVectorizer(ngram_range = (2,2))

# # unigram + bigram
# vectorizer = TfidfVectorizer(ngram_range = (2,2))


#for Count Vectorizer
# unigram
vectorizer = CountVectorizer(ngram_range = (1,1))


# performance has been best for CountVectorizer

def dataset(pathname, model = 'TF_IDF', default = 'test', vectorizer = vectorizer):
    
    """
    model : TF_IDF or Doc_term or Doc2Vec
    default : train or test
    """
    
    df = pd.read_csv(pathname)

    if(model == 'TF_IDF' or model == 'Doc_term'):
        corpus = []
        for x in df['tweet']:
            corpus.append(clean_text(remove_emojis(x)))
        
        if(default == 'train'):
            X = vectorizer.fit_transform(corpus)
        elif(default == 'test'):
            X = vectorizer.transform(corpus)
        
        Y = np.array([1 if y == 'real' else 0 for y in df['label']])
        
        #print(X.shape, Y.shape)   
        return X, Y
    
    elif(model == 'Doc2Vec'):
        if(default == 'train'):
            X = doc2vec_model.transform(df['tweet'])
        elif(default == 'test'):
            X = doc2vec_model.transform(df['tweet'])

        Y = [1 if y == 'real' else 0 for y in df['label']]
        
        pca = PCA(n_components=500)
        X = pca.fit_transform(X)
        print(X.shape)      
        return X, Y
        
        

# Doc2Vec features

In [192]:
X_train, Y_train = dataset(Train_csv , model = 'Doc2Vec', default = 'train')

(6420, 500)


In [193]:
from sklearn.linear_model import LogisticRegression

In [194]:
clf = LogisticRegression(random_state=0).fit(X_train, Y_train)

In [195]:
y_pred = [int(clf.predict(X_train[i, :].reshape(1, -1))) for i in range(6420)]

In [196]:
def accuracy(Y, y):
    acc = 0
    for i in range(len(Y)):
        if Y[i] == y[i]:
            acc +=1
    return acc/len(Y)
            

In [197]:
accuracy(Y_train, y_pred)

0.5308411214953271

In [198]:
X_test, y_test =  dataset(Test_csv, model = 'Doc2Vec' , default = 'test')

(2140, 500)


In [199]:
ytest_pred = [int(clf.predict(X_test[i, :].reshape(1, -1))) for i in range(len(X_test))]

In [200]:
accuracy(y_test, ytest_pred)

0.5275700934579439

# Logistic Regression

In [201]:
X_train, Y_train = dataset(Train_csv , model = 'TF_IDF', default = 'train')
X_test, Y_test =  dataset(Val_csv, model = 'TF_IDF' , default = 'test')

In [202]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train, Y_train)
y_pred = [int(clf.predict(X_train[i, :].reshape(1, -1))) for i in range(6420)]

In [203]:
# training accuracy
accuracy(Y_train, y_pred)

0.9947040498442368

In [204]:
ytest_pred = [int(clf.predict(X_test[i, :].reshape(1, -1))) for i in range(2140)]
accuracy(Y_test, ytest_pred)

0.9219626168224299

In [205]:
#tune hyperparameters

# Random Forest

In [206]:
from sklearn.ensemble import RandomForestClassifier as rfc

# rfc will take huge time to converge for so many features, may be truncated SVD to decrease number of features
# tune hyper-parameters to improve test accuracy
RFC = rfc(random_state=0).fit(X_train, Y_train)
y_pred = [int(RFC.predict(X_train[i, :].reshape(1, -1))) for i in range(6420)]

In [207]:
# training accuracy
# overfitting clearly !, anyway expected
accuracy(Y_train, y_pred)

1.0

In [208]:
ytest_pred = [int(RFC.predict(X_test[i, :].reshape(1, -1))) for i in range(2140)]
accuracy(Y_test, ytest_pred)

0.922429906542056

Grid Search for Random Forest

In [134]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = rfc()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [135]:
grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   39.6s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  4.0min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [80, 90, 100, 110],
                         'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             verbose=2)

In [136]:
grid_search.best_params_
best_grid = grid_search.best_estimator_


In [137]:
y_pred = [int(best_grid.predict(X_train[i, :].reshape(1, -1))) for i in range(6420)]

In [None]:
accuracy(Y_train, y_pred)

# SVMs using various kernels

# Naive Bayes

# XGBOOST

In [209]:
import xgboost as xgb
xgb1 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'logloss',
 nthread=4,
 scale_pos_weight=1,
 seed=27)


xgb1 = xgb.XGBClassifier()

xgbmodel = xgb1.fit(X_train, Y_train)
y_pred = [int(xgbmodel.predict(X_train[i, :].reshape(1, -1))) for i in range(6420)]






In [210]:
# training accuracy
accuracy(Y_train, y_pred)

0.9537383177570093

In [211]:
ytest_pred = [int(xgbmodel.predict(X_test[i, :].reshape(1, -1))) for i in range(2140)]
accuracy(Y_test, ytest_pred)

0.9098130841121496

# MLPs (ANN)

In [216]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes = (128, 64) , random_state = 1,  max_iter = 300, learning_rate = 'adaptive')

In [217]:
mlpmodel = mlp.fit(X_train, Y_train)
y_pred = [int(mlpmodel.predict(X_train[i, :].reshape(1, -1))) for i in range(6420)]

In [218]:
# training accuracy
accuracy(Y_train, y_pred)

1.0

In [219]:
ytest_pred = [int(mlpmodel.predict(X_test[i, :].reshape(1, -1))) for i in range(2140)]
accuracy(Y_test, ytest_pred)

0.9285046728971963

Look into RNNs and CNNS