In [101]:
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import preprocess_string
import gensim.parsing.preprocessing as gsp
from sklearn import utils as skutils
from gensim import utils
from tqdm import tqdm
import multiprocessing
import nltk
import re
from collections import Counter, defaultdict
from nltk.corpus import stopwords


In [174]:
Train_csv = "D:\\Courses\\Sem 8 2021-22\\COL865\\Project Dataset\\CodaLab\\Constraint_English_Train - Sheet1.csv"
Test_csv = "D:\\Courses\\Sem 8 2021-22\\COL865\\Project Dataset\\CodaLab\\english_test_with_labels - Sheet1.csv"

In [175]:
df = pd.read_csv(Train_csv)

In [103]:
def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

In [104]:
def clean_text(
    string: str, 
    punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
    stop_words=[]):
    

    string = re.sub(r'https?://\S+|www\.\S+', '', string)


    string = re.sub(r'<.*?>', '', string)


    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, " ") 

    string = string.lower()


    string = ' '.join([word for word in string.split() if word not in stop_words])

    string = re.sub(r'\s+', ' ', string).strip()

    return string   

In [105]:
filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

In [106]:
def normalize(X):
    mean = np.mean(X)
    stddev = np.std(X)
    return (X - mean) * (1 / stddev)

DOC2VEC Transformer

In [109]:
class Doc2VecTransformer(BaseEstimator):

    def __init__(self, vector_size=100, learning_rate=0.02, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() - 1

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(clean_text(remove_emojis(row)).split(), [index]) for index, row in enumerate(df_x)]
        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers)

        for epoch in range(self.epochs):
            model.train(skutils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha

        self._model = model
        return self

    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(clean_text(row).split())
                                     for index, row in enumerate(df_x)]))

In [124]:
# way to use doc2vec transformer

# can i train it on a a  larger corpus ?
# look into it.
df_x = df['tweet']
doc2vec = Doc2VecTransformer(vector_size = 2000)
doc2vec_model =  doc2vec.fit(df_x)
doc2vec_features = doc2vec_model.transform(df_x)

100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 3210232.68it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 3206410.06it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 2144586.79it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 3238416.32it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 2138455.50it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 1283481.01it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 1620377.40it/s]
100%|█████████████████████████████████████████████████████████████████████████| 6420/6420 [00:00<00:00, 3204120.86it/s]
100%|███████████████████████████████████

In [194]:
vectorizer = TfidfVectorizer()
vectorizer = CountVectorizer()
def dataset(pathname, model = 'TF_IDF', default = 'test', vectorizer = vectorizer):
    
    """
    model : TF_IDF or Doc_term or Doc2Vec
    default : train or test
    """
    
    df = pd.read_csv(pathname)

    if(model == 'TF_IDF' or model == 'Doc_term'):
        corpus = []
        for x in df['tweet']:
            corpus.append(clean_text(remove_emojis(x)))
        
        if(default == 'train'):
            X = vectorizer.fit_transform(corpus)
        elif(default == 'test'):
            X = vectorizer.transform(corpus)
        
        Y = np.array([1 if y == 'real' else 0 for y in df['label']])
        
        #print(X.shape, Y.shape)   
        return X, Y
    
    elif(model == 'Doc2Vec'):
        if(default == 'train'):
            X = doc2vec_model.transform(df['tweet'])
        elif(default == 'test'):
            X = doc2vec_model.transform(df['tweet'])

        Y = [1 if y == 'real' else 0 for y in df['label']]
        
        pca = PCA(n_components=500)
        X = pca.fit_transform(X)
        print(X.shape)      
        return X, Y
        
        

# Doc2Vec features

In [184]:
X_train, Y_train = dataset(Train_csv , model = 'Doc2Vec', default = 'train')

(6420, 500)


In [164]:
from sklearn.linear_model import LogisticRegression

In [165]:
clf = LogisticRegression(random_state=0).fit(X_train, Y_train)

In [166]:
y_pred = [int(clf.predict(X_train[i, :].reshape(1, -1))) for i in range(6420)]

In [167]:
def accuracy(Y, y):
    acc = 0
    for i in range(len(Y)):
        if Y[i] == y[i]:
            acc +=1
    return acc/len(Y)
            

In [168]:
accuracy(Y_train, y_pred)

0.5347352024922118

In [176]:
X_test, y_test =  dataset(Test_csv, model = 'Doc2Vec' , default = 'test')

(2140, 500)


In [177]:
ytest_pred = [int(clf.predict(X_test[i, :].reshape(1, -1))) for i in range(len(X_test))]

In [178]:
accuracy(y_test, ytest_pred)

0.530373831775701

# TF IDF features + Logistic Regression

In [207]:
X_train, Y_train = dataset(Train_csv , model = 'TF_IDF', default = 'train')
X_test, Y_test =  dataset(Test_csv, model = 'TF_IDF' , default = 'test')

In [208]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train, Y_train)
y_pred = [int(clf.predict(X_train[i, :].reshape(1, -1))) for i in range(6420)]

In [209]:
# training accuracy
accuracy(Y_train, y_pred)

0.9947040498442368

In [210]:
ytest_pred = [int(clf.predict(X_test[i, :].reshape(1, -1))) for i in range(2140)]
accuracy(Y_test, ytest_pred)

0.9257009345794392

In [None]:
#tune hyperparameters

# TF IDF features + Random Forest

In [204]:
from sklearn.ensemble import RandomForestClassifier as rfc

# rfc will take huge time to converge for so many features, may be truncated SVD to decrease number of features
# tune hyper-parameters to improve test accuracy
RFC = rfc(random_state=0).fit(X_train, Y_train)
y_pred = [int(RFC.predict(X_train[i, :].reshape(1, -1))) for i in range(6420)]

In [205]:
# training accuracy
# overfitting clearly !, anyway expected
accuracy(Y_train, y_pred)

1.0

In [206]:
ytest_pred = [int(clf.predict(X_test[i, :].reshape(1, -1))) for i in range(2140)]
accuracy(Y_test, ytest_pred)

0.9233644859813084

# SVMs using various kernels

# Naive Bayes

# XGBOOST

# MLPs (ANN)

Look into RNNs and CNNS