In [1]:
import numpy as np
import pandas as pd
import re
from string import punctuation
from time import process_time
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import unicodedata
import preprocessor as p

In [2]:
class Preprocess_Data():
    
    # ----------------------------------------- Constructor -----------------------------------------
    
    def __init__(self):
        self.punctuation = set(punctuation)
        self.lemmatizer = WordNetLemmatizer()
        p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY)
        self.stopword_list = set(stopwords.words('english'))
        unwanted_stopwords = {'no', 'nor', 'not', 'ain', 'aren', "aren't", 'couldn', 'what', 'which', 'who',
                              'whom', 'why', 'how', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
                              "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
                              "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
                              "shouldn't", 'wasn',"wasn't",'weren', "weren't", 'won', "won't", 'wouldn',
                              "wouldn't", 'don', "don't"}

        self.stopword_list = [x for x in self.stopword_list if x not in unwanted_stopwords]
       
    
    # ----------------------------------------- Read Data -----------------------------------------
    
    def read_data(self, path):
        df = pd.read_csv(path, usecols=['user_id','created_at', 'tweet'])
        return df
    
    
    # ----------------------------------------- Clean Data -----------------------------------------
    
    def clean_data(self, tweets):
        cleaned_tweets = []
        for text in tweets:
            
            # Clean tweet
            text = p.clean(text)
            
            # Remove special characters
            text = re.sub(r'(\\x(.)*)', '',text)
            text = re.sub(r'\\n|\\t|\\n\\n', ' ', text)
            text = re.sub(r"b'RT|b'|b RT|b\"RT", "", text)
            text = re.sub("[@#$%^&*)(}{|/><=+=_:\"\\\\]+"," ",text).strip()
            
            #Remove punctuation marks
            text = "".join(x for x in text if x not in self.punctuation)
            
            # Remove accented words
            text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            
            # Splitting Hashtag words
            text = " ".join([x for x in re.split('([A-Z][a-z]+)', text) if x])
            
            # Remove long spaces
            pattern = r'^\s*|\s\s*'
            text = re.sub(pattern, ' ', text).strip()
            
            # Remove numbers
            text = re.sub('[0-9]+', '', text)
            
            cleaned_tweets.append(text)
        
        return cleaned_tweets
    
    
    # ----------------------------------------- Preprocess Data -----------------------------------------
    
    def preprocess_data(self, tweets):
        preprocessed_tweets = []
        for text in tweets:
            
            # Remove stopwords
            text = " ".join(x for x in text.lower().split() if x not in self.stopword_list)
            
            # Text Lemmatization
            lemmatized_words = []
            for word in text.split():
                word1 = self.lemmatizer.lemmatize(word, pos="n")
                word2 = self.lemmatizer.lemmatize(word1, pos="v")
                word3 = self.lemmatizer.lemmatize(word2, pos=("a"))
                lemmatized_words.append(word3)
            text = " ".join(x for x in lemmatized_words)
            
            preprocessed_tweets.append(text)
            
        return preprocessed_tweets

In [3]:
pre = Preprocess_Data()

In [4]:
input_path = "dataset/raw_dataset/khalistan.csv"
output_path = "dataset/cleaned_dataset/khalistan_cleaned.csv"

data = pre.read_data(input_path)
data

Unnamed: 0,user_id,created_at,tweet
0,b'SinghGurprabh',2020-09-29 07:46:24,b'RT @ChopraDilpreet: This #PIL filed against ...
1,b'Gurpreet246890',2020-09-29 07:44:02,b'@majorgauravarya #Khalistan is Voice of Sikh...
2,b'Gurpree91384693',2020-09-29 07:33:52,"b""RT @Jas_preet1984: Sikhs For Justice has bee..."
3,b'Jamshaid_2012',2020-09-29 07:19:54,b'RT @Suleman2552: The Sikh community all over...
4,b'ChopraDilpreet',2020-09-29 07:08:52,b'This #PIL filed against #Twitter for promoti...
5,b'jmez1010',2020-09-29 06:53:52,"b""RT @Jasleen_Kaur11: \xf0\x9f\xa4\xa3\xf0\x9f..."
6,b'jmez1010',2020-09-29 06:51:23,"b""RT @DarrenVirk: #Pakistan around partition a..."
7,b'jmez1010',2020-09-29 06:48:21,b'RT @Harbaks21769227: #Pannun has no shame le...
8,b'jmez1010',2020-09-29 06:39:49,"b""RT @Jasleen_Kaur11: Pannun, a Pakistani pupp..."
9,b'jmez1010',2020-09-29 06:38:54,"b'RT @Preetka57645405: There is no #Khalistan,..."


In [5]:
raw_tweets = data.tweet.values.tolist()
raw_tweets[:2]

["b'RT @ChopraDilpreet: This #PIL filed against #Twitter for promoting #Khalistan is a good initiative. #SikhCommunity is just trying to save t\\xe2\\x80\\xa6'",
 "b'@majorgauravarya #Khalistan is Voice of Sikhs.\\nNever forget when the Indian State killed 35 Sikhs in the chatti sin\\xe2\\x80\\xa6 https://t.co/F5msNWpAkn'"]

In [6]:
cleaned_tweets = pre.clean_data(raw_tweets)
cleaned_tweets

['This PIL filed against Twitter for promoting Khalistan is a good initiative Sikh Community is just trying to save t',
 'Khalistan is Voice of Sikhs Never forget when the Indian State killed  Sikhs in the chatti sin',
 'Sikhs For Justice has been misleading the Sikh Community in the name of Khalistan with the support of Pakistan Its an',
 'The Sikh community all over in the world including India are protesting against Hindutva and RSS terrorism And the Refer',
 'This PIL filed against Twitter for promoting Khalistan is a good initiative Sikh Community is just trying to sa',
 '',
 'Pakistan around partition as weve never seen it before Photographs coloured by Ehsan Rehan First photograph is Gurdwara',
 'Pannun has no shame left',
 'Pannun a Pakistani puppet who works for Pakistan see this Indian Sikhs answer to the Referendum  for you Shame',
 'There is no Khalistan there is only Khai Garbage Dear Sikhs the quicker you realise Pannun',
 'Pannun is not listening to Real Sikhs He is in h

In [7]:
preprocess_tweets = pre.preprocess_data(cleaned_tweets)
preprocess_tweets

['pil file twitter promote khalistan good initiative sikh community try save',
 'khalistan voice sikh never forget indian state kill sikh chatti sin',
 'sikh justice mislead sikh community name khalistan support pakistan',
 'sikh community world include india protest hindutva r terrorism refer',
 'pil file twitter promote khalistan good initiative sikh community try sa',
 '',
 'pakistan around partition weve never see photograph colour ehsan rehan first photograph gurdwara',
 'pannun no shame leave',
 'pannun pakistani puppet who work pakistan see indian sikh answer referendum shame',
 'no khalistan khai garbage dear sikh quick realise pannun',
 'pannun not listen real sikh imaginary world doesnt care sikh community',
 '',
 'sikh justice mislead sikh community name khalistan support pakistan',
 'gherao sept truck rally day modi solution farmersbill sfj khalistan register',
 'sikh justice mislead sikh community name khalistan support pakistan',
 'pig india no no not regular pig who dema