In [1]:
import pandas as pd
import numpy as np
import re
from string import punctuation
import unicodedata
import preprocessor as p
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
import joblib

In [2]:
class Classification:
    """Helper bundling the tweet-classification pipeline used in this notebook.

    It loads a labelled CSV, cleans raw tweet text, removes stopwords and
    lemmatizes, builds word-level TF-IDF features, and fits/evaluates a
    classifier.
    """

    # ----------------------------------------- Constructor -----------------------------------------

    def __init__(self):
        """Set up punctuation, the lemmatizer, tweet-preprocessor options and the stopword list."""
        self.punctuation = set(punctuation)
        self.lemmatizer = WordNetLemmatizer()
        p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY)

        # Negations and question words are excluded from the stopword list so
        # that they survive stopword removal in preprocess_data().
        unwanted_stopwords = {'no', 'nor', 'not', 'ain', 'aren', "aren't", 'couldn', 'what', 'which', 'who',
                              'whom', 'why', 'how', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
                              "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
                              "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
                              "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
                              "wouldn't", 'don', "don't"}

        # Kept as a list (as before) for callers that read the attribute;
        # sorted so the order is deterministic between runs.
        self.stopword_list = sorted(set(stopwords.words('english')) - unwanted_stopwords)

    # ---------------------------------------- Read Data ----------------------------------------

    def read_data(self, path, random_state=None):
        """Read the 'tweet' and 'label' columns from a CSV, drop null tweets and shuffle the rows.

        Args:
            path: Location of the CSV file (anything pandas.read_csv accepts).
            random_state: Optional seed forwarded to DataFrame.sample so the
                shuffle is reproducible. The default (None) keeps the previous
                unseeded behaviour.

        Returns:
            A shuffled DataFrame; the original index labels are preserved.
        """
        df = pd.read_csv(path, usecols=['tweet', 'label'])
        df = df[pd.notnull(df.tweet)]
        return df.sample(frac=1, random_state=random_state)

    # ----------------------------------------- Clean Data -----------------------------------------

    def clean_data(self, tweets):
        """Clean raw tweet strings.

        Args:
            tweets: Iterable of raw tweet strings.

        Returns:
            A list of cleaned strings, one per input tweet, with whitespace
            normalised to single spaces and no leading/trailing spaces.
        """
        cleaned_tweets = []
        for text in tweets:

            # Tweet-preprocessor pass, using the options configured in __init__
            text = p.clean(text)

            # Remove special characters
            # NOTE(review): this pattern deletes everything from the first literal
            # "\x" to the end of the line, not just the escape sequence itself.
            # Presumably only the escaped bytes were meant to go - confirm before changing.
            text = re.sub(r'(\\x(.)*)', '', text)
            text = re.sub(r'\\n|\\t|\\n\\n', ' ', text)
            text = re.sub(r"b'RT|b'|b RT|b\"RT", "", text)
            text = re.sub("[@#$%^&*)(}{|/><=+=_:\"\\\\]+", " ", text).strip()

            # Remove punctuation marks
            text = "".join(ch for ch in text if ch not in self.punctuation)

            # Remove accents: decompose, then drop every non-ASCII code point
            text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

            # Split CamelCase hashtag words ("FreePunjab" -> "Free Punjab")
            text = " ".join([part for part in re.split('([A-Z][a-z]+)', text) if part])

            # Remove numbers. This runs before whitespace normalisation so that
            # deleting a number cannot leave double or trailing spaces behind.
            text = re.sub('[0-9]+', '', text)

            # Collapse whitespace runs and trim the ends
            text = re.sub(r'\s+', ' ', text).strip()

            cleaned_tweets.append(text)

        return cleaned_tweets

    # ----------------------------------------- Preprocess Data -----------------------------------------

    def preprocess_data(self, tweets):
        """Lower-case, remove stopwords and lemmatize each cleaned tweet.

        Args:
            tweets: Iterable of (already cleaned) tweet strings.

        Returns:
            A list of strings whose tokens are joined by single spaces.
        """
        # Build the lookup once per call: set membership is O(1), whereas the
        # stored list would be scanned for every token.
        stopword_lookup = set(self.stopword_list)

        preprocessed_tweets = []
        for text in tweets:
            lemmatized_words = []
            for word in text.lower().split():
                if word in stopword_lookup:
                    continue
                # Lemmatize successively as noun, verb and adjective.
                word = self.lemmatizer.lemmatize(word, pos="n")
                word = self.lemmatizer.lemmatize(word, pos="v")
                word = self.lemmatizer.lemmatize(word, pos="a")
                lemmatized_words.append(word)
            preprocessed_tweets.append(" ".join(lemmatized_words))

        return preprocessed_tweets

    # ------------------------------ Word-level unigram TF-IDF Vectorization ------------------------------

    def tfidf_vectorize(self, X_train, x_test):
        """Fit a TF-IDF vectorizer on the training texts and transform both splits.

        Args:
            X_train: Training documents; the vectorizer is fitted on these only.
            x_test: Test documents; transformed with the fitted vectorizer.

        Returns:
            Tuple of (fitted vectorizer, dense train matrix, dense test matrix).
        """
        tfidf_vec = TfidfVectorizer(sublinear_tf=True, min_df=3, norm='l2', stop_words='english')
        tfidf_vec.fit(X_train)
        X_train_tfidf = tfidf_vec.transform(X_train).toarray()
        # Use the argument: the previous code read a global named X_test here,
        # ignoring whatever the caller passed in.
        X_test_tfidf = tfidf_vec.transform(x_test).toarray()
        return tfidf_vec, X_train_tfidf, X_test_tfidf

    # ------------------------------------ Train Model ------------------------------------

    def train_model(self, classifier, X_train, X_test, y_train, y_test):
        """Fit a classifier and score it on the held-out split.

        Args:
            classifier: Estimator exposing fit(X, y) and, on the fitted
                object, predict(X).
            X_train, y_train: Training features and labels.
            X_test, y_test: Held-out features and labels.

        Returns:
            Tuple of (object returned by classifier.fit, accuracy on the test split).
        """
        model = classifier.fit(X_train, y_train)
        predictions = model.predict(X_test)
        # accuracy_score takes (y_true, y_pred); the value is the same either way.
        return model, metrics.accuracy_score(y_test, predictions)
    

In [3]:
# Build the pipeline helper once; its constructor configures the tweet
# preprocessor options, the lemmatizer and the stopword list.
cl = Classification()

In [4]:
# Load the labelled tweets. read_data() already drops rows with a missing tweet
# and shuffles the frame, so the rows are not reshuffled again here; this also
# keeps the cell idempotent when it is re-run.
path = "dataset/combined_dataset/data_main.csv"
data = cl.read_data(path)
data

Unnamed: 0,tweet,label
8556,never look anybody unless youre help life quote,0.0
4004,great terry fox pathetic conclude no support k...,1.0
4590,algeria,1.0
4791,great terry fox pathetic annoy khalistan refer...,1.0
8972,marvelous positive affirmation,0.0
...,...,...
8454,hey love music really not youre perform murder...,0.0
6281,anyone else morning meet amp veep,0.0
4521,b khalistan answer india anti farmer bill sfj ...,1.0
6852,ever felt like nba fix bamboosled market nbafi...,0.0


In [5]:
# Hold out 30% of the tweets for evaluation; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    data["tweet"].values.tolist(),
    data["label"],
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [6]:
# Sanity check: features and labels must have matching lengths per split.
print(len(X_train), len(y_train), len(X_test), len(y_test), sep="\n")

7370
7370
3159
3159


In [7]:
# Fit the TF-IDF vectorizer on the training texts only, then transform both splits.
# The fitted vectorizer is kept because new text must be transformed with it before prediction.
tfidf_vec, X_train_tfidf, X_test_tfidf = cl.tfidf_vectorize(X_train, X_test)

In [8]:
# Both matrices must share the same number of feature columns.
print(X_train_tfidf.shape, X_test_tfidf.shape, sep="\n")

(7370, 2167)
(3159, 2167)


In [20]:
# Naive Bayes on word-level TF-IDF vectors.
# The printed label previously said "Count Vectors", but the features come from
# the TF-IDF vectorizer fitted above.
NB_model, NB_accuracy = cl.train_model(MultinomialNB(), X_train_tfidf, X_test_tfidf, y_train, y_test)
print("Naive Bayes, TF-IDF Vectors: ", NB_accuracy*100)

Naive Bayes, Count Vectors:  98.32225387780943


In [None]:
# Persist the fitted Naive Bayes model to disk.
# NOTE(review): the fitted `tfidf_vec` is not saved in the cells visible here, yet new text
# must be transformed with it before calling predict - confirm it is persisted elsewhere.
# NOTE(review): the ".pk1" extension may be a typo for ".pkl" - confirm against the loading code.
joblib.dump(NB_model, "models/NB_tfidf.pk1")

In [21]:
# Linear SVC on word-level TF-IDF vectors.
# The comment and printed label were copied from the Naive Bayes cell and
# misreported which model/features produced this accuracy.
LinearSVC_model, LinearSVC_accuracy = cl.train_model(LinearSVC(), X_train_tfidf, X_test_tfidf, y_train, y_test)
print("LinearSVC, TF-IDF Vectors: ", LinearSVC_accuracy*100)

Naive Bayes, Count Vectors:  99.08198797087687


In [None]:
# Persist the fitted LinearSVC model to disk.
# NOTE(review): the ".pk1" extension may be a typo for ".pkl" - confirm against the loading code.
joblib.dump(LinearSVC_model, "models/LinearSVC_tfidf.pk1")

In [11]:
# Hand-written sample tweets used to spot-check the trained models.
test_set = [
    "#Punjab should be given its #Freedom as soon as possible. #Khalistan #Referendum2020 #FreePunjab",
    "Donald Trump would not win this year's elections. #Trump",
    "ISRO makes a giant leap forward by sending satellites to mars. #ISRO #MissionMangal",
    "The new Pime minister fellowship program will benefit many students.",
    "The #Khalistan movement is gaining momentum.. #India is falling apart.",
    "The  so-called pure country called Pakistan is killing, murdering, blasting, and commuting inhuman atrocities on their own Muslim brotherhood. Do the Sikhs want the same treatment meted out for themselves? #Khalistan is just an anti-India agenda of Pakistan, stop demanding it",
]


In [12]:
# Run the sample tweets through the cleaning and preprocessing steps, then
# transform them with the vectorizer fitted on the training split so the
# feature columns line up with what the models were trained on.
cleaned_test_set = cl.clean_data(test_set)
preprocessed_test_set = cl.preprocess_data(cleaned_test_set)
tfidf_test_set = tfidf_vec.transform(preprocessed_test_set).toarray()
# One row per sample tweet; the column count should equal the training feature count.
tfidf_test_set.shape

(6, 2167)

In [18]:
# Naive Bayes predictions for the sample tweets, printed as class names.
NB_result = NB_model.predict(tfidf_test_set)
for r in NB_result:
    # Label 0 -> "General", label 1 -> "Khalistan"; any other value prints nothing.
    if r == 0:
        print("General")
    elif r == 1:
        print("Khalistan")

Khalistan
General
General
General
Khalistan
Khalistan


In [19]:
# LinearSVC predictions for the sample tweets, printed as class names.
LinearSVC_result = LinearSVC_model.predict(tfidf_test_set)
for r in LinearSVC_result:
    # Label 0 -> "General", label 1 -> "Khalistan"; any other value prints nothing.
    if r == 0:
        print("General")
    elif r == 1:
        print("Khalistan")

Khalistan
General
General
General
Khalistan
Khalistan
