In [18]:
import numpy as np
import pandas as pd
import re
from string import punctuation
from time import process_time
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import unicodedata
import preprocessor as p

In [57]:
class Preprocess_Data():
    
    # ----------------------------------------- Constructor -----------------------------------------
    
    def __init__(self):
        self.punctuation = set(punctuation)
        self.lemmatizer = WordNetLemmatizer()
        p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY)
        self.stopword_list = set(stopwords.words('english'))
        unwanted_stopwords = {'no', 'nor', 'not', 'ain', 'aren', "aren't", 'couldn', 'what', 'which', 'who',
                              'whom', 'why', 'how', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
                              "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
                              "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
                              "shouldn't", 'wasn',"wasn't",'weren', "weren't", 'won', "won't", 'wouldn',
                              "wouldn't", 'don', "don't"}

        self.stopword_list = [x for x in self.stopword_list if x not in unwanted_stopwords]
       
    
    # ----------------------------------------- Read Data -----------------------------------------
    
    def read_data(self, path):
        df = pd.read_csv(path, usecols=['user_id', 'tweet'])
        return df
    
    
    # ----------------------------------------- Clean Data -----------------------------------------
    
    def clean_data(self, tweets):
        cleaned_tweets = []
        for text in tweets:
            
            # Clean tweet
            text = p.clean(text)
            
            # Remove special characters
            text = re.sub(r'(\\x(.)*)', '',text)
            text = re.sub(r'\\n|\\t|\\n\\n', ' ', text)
            text = re.sub(r"b'RT|b'|b RT|b\"RT|b", "", text)
            text = re.sub("[@#$%^&*)(}{|/><=+=_:\"\\\\]+"," ",text).strip()
            
            #Remove punctuation marks
            text = "".join(x for x in text if x not in self.punctuation)
            
            # Remove accented words
            text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            
            # Splitting Hashtag words
            text = " ".join([x for x in re.split('([A-Z][a-z]+)', text) if x])
            
            # Remove long spaces
            pattern = r'^\s*|\s\s*'
            text = re.sub(pattern, ' ', text).strip()
            
            # Remove numbers
            text = re.sub('[0-9]+', '', text)
            
            cleaned_tweets.append(text)
        
        return cleaned_tweets
    
    
    # ----------------------------------------- Preprocess Data -----------------------------------------
    
    def preprocess_data(self, tweets):
        preprocessed_tweets = []
        for text in tweets:
            
            # Remove stopwords
            text = " ".join(x for x in text.lower().split() if x not in self.stopword_list)
            
            # Text Lemmatization
            lemmatized_words = []
            for word in text.split():
                word1 = self.lemmatizer.lemmatize(word, pos="n")
                word2 = self.lemmatizer.lemmatize(word1, pos="v")
                word3 = self.lemmatizer.lemmatize(word2, pos=("a"))
                lemmatized_words.append(word3)
            text = " ".join(x for x in lemmatized_words)
            
            preprocessed_tweets.append(text)
            
        return preprocessed_tweets

In [58]:
pre = Preprocess_Data()

In [97]:
input_path = "dataset/raw_dataset/khalistan_main.csv"
output_path = "dataset/cleaned_dataset/khalistan_cleaned.csv"

path1 = "../Anti-National-Tweets-Classification/dataset/raw_dataset/control_tweets.csv"
path2 = "../Anti-National-Tweets-Classification/dataset/raw_dataset/khalistan_main.csv"
path3 = "../Anti-National-Tweets-Classification/dataset/raw_dataset/Referendum2020.csv"

# data1 = pre.read_data(input_path)
# data
d1 = pd.read_csv(path1)
d2 = pre.read_data(path2)
d3 = pre.read_data(path3)

In [98]:
# raw_tweets = data.tweet.values.tolist()
# raw_tweets[:2]
r1 = d1.tweet_low.values.tolist()
r2 = d2.tweet.values.tolist()
r3 = d3.tweet.values.tolist()

In [99]:
# cleaned_tweets = pre.clean_data(raw_tweets)
# cleaned_tweets

c1 = pre.clean_data(r1)
c2 = pre.clean_data(r2)
c3 = pre.clean_data(r3)

In [100]:
#Save cleaned

In [101]:
# preprocess_tweets = pre.preprocess_data(cleaned_tweets)
# preprocess_tweets

p1 = pre.preprocess_data(c1)
p2 = pre.preprocess_data(c2)
p3 = pre.preprocess_data(c3)

In [102]:
p2

['pil file twitter promote khalistan good initiative sikh community try sa',
 '',
 'pakistan around partition weve never see photograph colour ehsan rehan first photograph gurdwara',
 'pannun no shame leave',
 'pannun pakistani puppet who work pakistan see indian sikh answer referendum shame',
 'pil file twitter promote khalistan good initiative sikh community try sa',
 '',
 'pakistan around partition weve never see photograph colour ehsan rehan first photograph gurdwara',
 'pannun no shame leave',
 'pannun pakistani puppet who work pakistan see indian sikh answer referendum shame',
 'no khalistan khai garbage dear sikh quick realise pannun',
 'pannun not listen real sikh imaginary world doesnt care sikh community',
 '',
 'sikh justice mislead sikh community name khalistan support pakistan',
 'gherao sept truck rally day modi solution farmersbill sfj khalistan register',
 'sikh justice mislead sikh community name khalistan support pakistan',
 'pig india no no not regular pig who demand

In [None]:
#Save preprocessed

In [65]:
print(type(p1))

<class 'list'>


In [103]:
f1 = pd.DataFrame(columns = ['tweet','hashtags','label'])
f2 = pd.DataFrame(columns = ['tweet','hashtags','label'])

In [104]:
f1['tweet'] = pd.Series(p1)
f1.hashtags = d1.tweet_low.apply(lambda x: re.findall(r"#(\w+)", x))
f1.label = 0

In [105]:
f1.head()


Unnamed: 0,tweet,hashtags,label
0,day go gettingthere,[gettingthere],0
1,friend tell shes afraid go dc rally amp attack...,[berniebros],0
2,ive notice lot icontf presentation mention hap...,[icontf16],0
3,get weekend show country music lylepierce,"[country, music, lylepierce]",0
4,final legislative session day officially begin...,[albany],0


In [106]:
print(type(p2),type(p3))

p2.extend(p3)


h1 = d2.tweet.apply(lambda x: re.findall(r"#(\w+)", x))
h2 = d3.tweet.apply(lambda x: re.findall(r"#(\w+)", x))
htemp = h1.append(h2,ignore_index = True)

f2.tweet = pd.Series(p2)
f2.hashtags = pd.Series(htemp)
f2.label = 1

<class 'list'> <class 'list'>


In [108]:
print(len(p2))

5289


In [107]:
f2

Unnamed: 0,tweet,hashtags,label
0,pil file twitter promote khalistan good initia...,"[PIL, Twitter, Khalistan, SikhCommunity]",1
1,,"[Kashmir_With_India, ShameOnSFJ, ShameOnPannun...",1
2,pakistan around partition weve never see photo...,"[Pakistan, Gurdwara]",1
3,pannun no shame leave,"[Pannun, Khalistan]",1
4,pannun pakistani puppet who work pakistan see ...,[Shame],1
...,...,...,...
5284,referendum vote,[Referendum2020],1
5285,part democracy take minute referendum,[Referendum2020],1
5286,true face khalistan not sikh puppet isi shame ...,"[Khalistan, Sikhs, ISI, ShameOnSFJ, ShameOnPan...",1
5287,sikh r suffer pakistan amp isi busy support kh...,"[Pakistan, ISI, Khalistan]",1


In [109]:
f2.to_csv('../Anti-National-Tweets-Classification/dataset/pre_processed/postive_class.csv')
f1.to_csv('../Anti-National-Tweets-Classification/dataset/pre_processed/negative_class.csv')

In [54]:
df2 = pd.read_csv('../Anti-National-Tweets-Classification/dataset/pre_processed/postive_class.csv')

In [43]:
# df2 = pd.DataFrame(columns = ['tweet','hashtags','label'])
df2['tweet'] = pd.Series(preprocess_tweets)
df2.hashtags = data.tweet.apply(lambda x: re.findall(r"#(\w+)", x))
df2.label = 1

In [44]:
df2.head()

Unnamed: 0,tweet,hashtags,label
0,pil file twitter promote khalistan good initia...,"[PIL, Twitter, Khalistan, SikhCommunity]",1
1,khalistan voice sikh never forget indian state...,[Khalistan],1
2,sikh justice mislead sikh community name khali...,[Khalistan],1
3,sikh community world include india protest hin...,[Refer],1
4,pil file twitter promote khalistan good initia...,"[PIL, Twitter, Khalistan, SikhCommunity]",1


In [45]:
df2.to_csv('../Anti-National-Tweets-Classification/dataset/pre_processed/postive_class.csv')