## **Mounting Google Drive**

In [1]:
# Mount Google Drive at /content/drive; the data files used below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Import Packages**

In [2]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import string, re
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## **Loading Dataset**

In [None]:
# Load the labelled statements CSV from the mounted Drive folder.
dataset = pd.read_csv(
    "/content/drive/MyDrive/BTP/Files/dataset.csv",
    encoding='utf-8',
)

In [None]:
# Preview the first rows to check which columns were loaded.
dataset.head()

Unnamed: 0,label,statement
0,0,Says the Annies List political group supports ...
1,1,When did the decline of coal start? It started...
2,1,"Hillary Clinton agrees with John McCain ""by vo..."
3,0,Health care reform legislation is likely to ma...
4,1,The economic turnaround started at the end of ...


## **Text Pre-Processing**

In [None]:
class TextPreprocessing:
    """Clean one text column of a DataFrame.

    The constructor copies ``df`` (the caller's frame is never modified) and
    immediately runs the whole cleaning pipeline on ``textColumnName``:

    1. lower-case the text (missing values become empty strings),
    2. remove URLs,
    3. remove hashtags,
    4. strip punctuation,
    5. remove English stop words,
    6. remove digits,
    7. lemmatize every remaining word.

    URLs and hashtags are removed *before* punctuation is stripped: once the
    ``#`` character has been deleted a hashtag can no longer be recognised, so
    the opposite order would silently leave every hashtag word in the text.

    Relies on ``stopwords``, ``WordNetLemmatizer`` and ``PorterStemmer`` from
    NLTK, and on ``string``, ``re`` and ``pd``, all imported at file level.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the text column.
    textColumnName : str
        Name of the column to clean.
    """

    # Compiled once at class creation instead of on every row.
    _URL_PATTERN = re.compile(r"http\S+")
    _HASHTAG_PATTERN = re.compile(r"#\S+")
    _NUMBER_PATTERN = re.compile(r"\d+")
    # Translation table that deletes every character in string.punctuation.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, df, textColumnName):
        self.m_df = df.copy()
        self.m_textColumnName = textColumnName
        # A set gives constant-time membership tests in CleanStopWords.
        self.m_stopWords = set(stopwords.words('english'))
        self.m_lemmatizer = WordNetLemmatizer()
        self.m_stemmer = PorterStemmer()
        print("Text Cleaning Starts")
        self.LowerText()
        print("\t\tText Lowered")
        # URL and hashtag removal need the punctuation still in place.
        self._ApplyStep(self.CleanUrls, "Urls Removed")
        self._ApplyStep(self.CleanHashTags, "HashTags Removed")
        self._ApplyStep(self.CleanPunctuation, "Punctuation Removed")
        self._ApplyStep(self.CleanStopWords, "Stopwords Removed")
        self._ApplyStep(self.CleanNumbers, "Numbers Removed")
        # Runs last: its split/join also collapses the whitespace gaps left
        # behind by the removal steps above.
        self._ApplyStep(self.LemmatizeText, "Lemmatization Done")
        print("Text Cleaning Done\n")

    def _ApplyStep(self, step, message):
        """Apply ``step`` to every value of the text column, then log ``message``."""
        column = self.m_textColumnName
        self.m_df[column] = self.m_df[column].apply(step)
        print("\t\t" + message)

    def GetDataFrame(self):
        """Return the cleaned copy of the DataFrame."""
        return self.m_df

    def LowerText(self):
        """Lower-case the text column in place on the internal copy.

        Missing values (NaN/None) become ``''`` and non-string values are
        converted with ``str`` so the later string-based steps cannot fail.
        """
        column = self.m_textColumnName
        self.m_df[column] = [
            '' if pd.isna(text) else str(text).lower()
            for text in self.m_df[column]
        ]

    def CleanPunctuation(self, text):
        """Return ``text`` with every ``string.punctuation`` character deleted."""
        return text.translate(self._PUNCTUATION_TABLE)

    def CleanStopWords(self, text):
        """Return ``text`` without the whitespace-separated words found in ``m_stopWords``."""
        return ' '.join(w for w in text.split() if w not in self.m_stopWords)

    def CleanUrls(self, text):
        """Delete every run of non-space characters that starts with ``http``."""
        return self._URL_PATTERN.sub("", text)

    def CleanHashTags(self, text):
        """Delete every ``#`` together with the non-space characters that follow it."""
        return self._HASHTAG_PATTERN.sub("", text)

    def CleanNumbers(self, text):
        """Delete every run of digits."""
        return self._NUMBER_PATTERN.sub("", text)

    def LemmatizeText(self, text):
        """Lemmatize each whitespace-separated word and re-join with single spaces."""
        return ' '.join(self.m_lemmatizer.lemmatize(word) for word in text.split())

    def StemmingText(self, text):
        """Stem each whitespace-separated word and re-join with single spaces.

        Not part of the constructor pipeline; available for callers that want it.
        """
        return ' '.join([self.m_stemmer.stem(word) for word in text.split()])


In [None]:
## Slow step: the full cleaning pass takes a while, so run this cell only when you have time
dataset_PreProcessed = (
    TextPreprocessing(dataset, 'statement')
    .GetDataFrame()
)
dataset_PreProcessed.to_csv(
    "/content/drive/MyDrive/BTP/Files/dataset_PreProcessed.csv",
    encoding='utf-8',
    index=False,
)

Text Cleaning Starts
		Text Lowered
		Punctuation Removed
		Stopwords Removed
		Urls Removed
		HashTags Removed
		Numbers Removed
		Lemmatization Removed
Text Cleaning Done



## **Combining All Data**

In [3]:
# Load the saved per-feature CSVs from Drive, listed in the order they are concatenated below.
dataset_PreProcessed = pd.read_csv("/content/drive/MyDrive/BTP/Files/dataset_PreProcessed.csv", encoding='utf-8')
dataset_Sentiment = pd.read_csv("/content/drive/MyDrive/BTP/Files/dataset_Sentiment.csv", encoding='utf-8')
dataset_Dependency = pd.read_csv("/content/drive/MyDrive/BTP/Files/dataset_Dependency.csv", encoding='utf-8')
dataset_POSTag = pd.read_csv("/content/drive/MyDrive/BTP/Files/dataset_POSTag.csv", encoding='utf-8')
dataset_NER = pd.read_csv("/content/drive/MyDrive/BTP/Files/dataset_NER.csv", encoding='utf-8')

In [4]:
# Place the five frames side by side (axis=1); pandas matches rows on the index.
# NOTE(review): assumes every CSV has the same number of rows in the same order — confirm.
combined_data = pd.concat(
    [
        dataset_PreProcessed,
        dataset_Sentiment,
        dataset_Dependency,
        dataset_POSTag,
        dataset_NER,
    ],
    axis=1,
)

## **Train/Validation/Test Split and Saving**

In [5]:
# Step 1: hold out 30% of all rows as the test set.
train_data, test_data = train_test_split(
    combined_data,
    test_size=0.3,
    random_state=42,
)
# Step 2: carve 10% of the remaining rows out as the validation set;
# train_data is rebound to the other 90%.
train_data, valid_data = train_test_split(
    train_data,
    test_size=0.1,
    random_state=42,
)

In [6]:
# Show the (rows, columns) shape of each split as a quick sanity check.
train_data.shape, test_data.shape, valid_data.shape

((48379, 87), (23039, 87), (5376, 87))

In [7]:
# Write each split to its own CSV on Drive, without the index column.
train_data.to_csv(
    "/content/drive/MyDrive/BTP/Files/train_data.csv",
    encoding='utf-8',
    index=False,
)
test_data.to_csv(
    "/content/drive/MyDrive/BTP/Files/test_data.csv",
    encoding='utf-8',
    index=False,
)
valid_data.to_csv(
    "/content/drive/MyDrive/BTP/Files/valid_data.csv",
    encoding='utf-8',
    index=False,
)