## **Mounting Google Drive**

In [None]:
# Mount Google Drive at /content/drive; the dataset and the feature CSVs used
# by this notebook are read from and written to paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Import Packages**

In [None]:
!pip install vaderSentiment
import numpy as np
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import spacy
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l[K     |██▋                             | 10 kB 25.3 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 17.7 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 17.7 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 13.3 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 11.7 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 13.5 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 12.1 MB/s eta 0:00:01[K     |████████████████████▉           | 81 kB 13.4 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 13.0 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 13.5 MB/s eta 0:00:01[K     |████████████████████████████▋   | 112 kB 13.5 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 13.5 MB/s eta 0:00:01[K     |████████████████████████████████| 125 

## **Loading Dataset**

In [None]:
# Read the source dataset (UTF-8 encoded CSV) from the mounted Drive folder.
dataset = pd.read_csv(
    "/content/drive/MyDrive/BTP/Files/dataset.csv",
    encoding='utf-8',
)

### *Named Entity Feature Extraction*

In [None]:
class NERFeatureExtraction:
    """Build per-row named-entity label counts for a text column.

    Each text in ``df[textColumnName]`` is run through the spaCy
    ``en_core_web_sm`` pipeline, and the ``label_`` of every entity in
    ``doc.ents`` is counted. The result has one row per input row and exactly
    one column per label in ``m_NER_features``, in that order.

    Extraction runs eagerly inside ``__init__``; fetch the result with
    ``GetDataFrame()``.
    """

    def __init__(self, df, textColumnName):
        """Load the spaCy pipeline and run the extraction immediately.

        Args:
            df: DataFrame holding the texts to analyse.
            textColumnName: Name of the column in ``df`` that holds the text.
        """
        print("NER Feature Extraction Starts")
        self.m_new_df = pd.DataFrame()
        self.m_df = df
        self.m_textColumnName = textColumnName
        self.m_NER = spacy.load("en_core_web_sm")
        self.m_NER_features = ["PERSON","ORG","FAC","GPE","NORP","LOC","PRODUCT","EVENT","WORK_OF_ART","LAW","LANGUAGE",
                            "DATE","TIME","PERCENT","MONEY","CARDINAL","QUANTITY","ORDINAL"]
        self.NER()
        print("NER Feature Extraction Done\n")

    def GetDataFrame(self):
        """Return the DataFrame of entity-label counts built by ``NER()``."""
        return self.m_new_df

    def NER(self):
        """Count entity labels for every text and store them in ``m_new_df``.

        Only labels listed in ``m_NER_features`` are kept. A label outside
        that list is ignored, so it cannot add a stray column that would be
        NaN for every other row.
        """
        rows = []
        # Iterate the text column directly; no other column of the row is used.
        for text in self.m_df[self.m_textColumnName]:
            doc = self.m_NER(text)
            found = Counter(ent.label_ for ent in doc.ents)
            # Counter returns 0 for labels that did not occur in this text.
            rows.append({label: found[label] for label in self.m_NER_features})
        # Passing columns= keeps the schema stable even when there are no rows.
        self.m_new_df = pd.DataFrame(rows, columns=self.m_NER_features)


In [None]:
# Slow cell: the extraction takes a while, so run it only when you have the time.

dataset_NER = NERFeatureExtraction(
    dataset,
    'statement',
).GetDataFrame()

# Save the entity-count features to Drive as CSV, without the row index.
dataset_NER.to_csv(
    '/content/drive/MyDrive/BTP/Files/dataset_NER.csv',
    index=False,
    encoding='utf-8',
)

### *Part-of-Speech Feature Extraction*

In [None]:
class POSTagFeatureExtraction:
    """Build per-row part-of-speech tag counts for a text column.

    Each text in ``df[textColumnName]`` is run through the spaCy
    ``en_core_web_sm`` pipeline, and the ``pos_`` of every token is counted.
    The result has one row per input row and exactly one column per tag in
    ``m_POS_features``, in that order.

    Extraction runs eagerly inside ``__init__``; fetch the result with
    ``GetDataFrame()``.
    """

    def __init__(self, df, textColumnName):
        """Load the spaCy pipeline and run the extraction immediately.

        Args:
            df: DataFrame holding the texts to analyse.
            textColumnName: Name of the column in ``df`` that holds the text.
        """
        print("POS Tag Feature Extraction Starts")
        self.m_new_df = pd.DataFrame()
        self.m_df = df
        self.m_textColumnName = textColumnName
        self.m_POS = spacy.load("en_core_web_sm")
        self.m_POS_features = [ "ADJ","ADP","ADV","AUX","CCONJ","DET","INTJ","NOUN","NUM","PART",
                            "PRON","X","PROPN","PUNCT","SCONJ","SYM","VERB","SPACE","CONJ"]
        self.POS()
        print("POS Tag Feature Extraction Done\n")

    def GetDataFrame(self):
        """Return the DataFrame of POS-tag counts built by ``POS()``."""
        return self.m_new_df

    def POS(self):
        """Count POS tags for every text and store them in ``m_new_df``.

        Only tags listed in ``m_POS_features`` are kept. A tag outside that
        list is ignored, so it cannot add a stray column that would be NaN
        for every other row.
        """
        rows = []
        # Iterate the text column directly; no other column of the row is used.
        for text in self.m_df[self.m_textColumnName]:
            doc = self.m_POS(text)
            found = Counter(token.pos_ for token in doc)
            # Counter returns 0 for tags that did not occur in this text.
            rows.append({tag: found[tag] for tag in self.m_POS_features})
        # Passing columns= keeps the schema stable even when there are no rows.
        self.m_new_df = pd.DataFrame(rows, columns=self.m_POS_features)


In [None]:
# Slow cell: the extraction takes a while, so run it only when you have the time.

dataset_POSTag = POSTagFeatureExtraction(
    dataset,
    'statement',
).GetDataFrame()

# Save the POS-tag count features to Drive as CSV, without the row index.
dataset_POSTag.to_csv(
    '/content/drive/MyDrive/BTP/Files/dataset_POSTag.csv',
    index=False,
    encoding='utf-8',
)

### *Dependency Feature Extraction*

In [None]:
class DependencyFeatureExtraction:
    """Tally dependency-relation labels for each row of a text column.

    The label set is taken from the parser component of the spaCy
    ``en_core_web_sm`` pipeline (``pipe_labels['parser']``). For every input
    row, the ``dep_`` of each token is counted, and only labels in that set
    are reported.

    The work is done during construction; use ``GetDataFrame()`` to read the
    resulting table.
    """

    def __init__(self, df, textColumnName):
        """Store the inputs, load the spaCy pipeline and run the extraction.

        Args:
            df: DataFrame holding the texts to analyse.
            textColumnName: Name of the column in ``df`` that holds the text.
        """
        print("Dependency Feature Extraction Starts")
        self.m_new_df = pd.DataFrame()
        self.m_df = df
        self.m_textColumnName = textColumnName
        self.m_Dep = spacy.load("en_core_web_sm")
        self.m_Dep_features = self.m_Dep.pipe_labels['parser']
        self.Dependency()
        print("Dependency Feature Extraction Done\n")

    def GetDataFrame(self):
        """Return the table of dependency-label counts."""
        return self.m_new_df

    def Dependency(self):
        """Fill ``m_new_df`` with one row of label counts per input row."""
        records = []
        for _, record in self.m_df.iterrows():
            parsed = self.m_Dep(record[self.m_textColumnName])
            tally = Counter(token.dep_ for token in parsed)
            # Report only the parser's known labels; a Counter yields 0 for
            # any label that did not occur, and unknown labels are skipped.
            records.append({label: tally[label] for label in self.m_Dep_features})
        self.m_new_df = pd.DataFrame(records)


In [None]:
# Slow cell: the extraction takes a while, so run it only when you have the time.

dataset_Dependency = DependencyFeatureExtraction(
    dataset,
    'statement',
).GetDataFrame()

# Save the dependency-label count features to Drive as CSV, without the row index.
dataset_Dependency.to_csv(
    '/content/drive/MyDrive/BTP/Files/dataset_Dependency.csv',
    index=False,
    encoding='utf-8',
)

### *Sentiment Feature Extraction*

In [None]:
class SentimentFeatureExtraction:
    """Build per-row VADER sentiment scores for a text column.

    Each text in ``uncleaned[textColumnName]`` is scored with
    ``SentimentIntensityAnalyzer.polarity_scores``. The result keeps the
    ``neg``, ``neu`` and ``pos`` scores and leaves out ``compound``.

    Extraction runs eagerly inside ``__init__``; fetch the result with
    ``GetDataFrame()``.
    """

    def __init__(self, uncleaned, textColumnName):
        """Create the analyzer and run the extraction immediately.

        Args:
            uncleaned: DataFrame holding the texts to score.
            textColumnName: Name of the column in ``uncleaned`` with the text.
        """
        print("Sentiment Feature Extraction Starts")
        self.m_analyzer = SentimentIntensityAnalyzer()
        self.m_new_df = pd.DataFrame()
        self.m_uncleaned = uncleaned
        self.m_textColumnName = textColumnName
        self.Sentiment()
        print("Sentiment Feature Extraction Done\n")

    def GetDataFrame(self):
        """Return the DataFrame of sentiment scores built by ``Sentiment()``."""
        return self.m_new_df

    def Sentiment(self):
        """Score every text and store the neg/neu/pos columns in ``m_new_df``."""
        scores = [
            self.m_analyzer.polarity_scores(text)
            for text in self.m_uncleaned[self.m_textColumnName]
        ]
        # Select the kept columns at construction instead of dropping
        # 'compound' afterwards: an empty input then yields an empty frame
        # with the expected columns rather than a KeyError from drop().
        self.m_new_df = pd.DataFrame(scores, columns=["neg", "neu", "pos"])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Slow cell: the extraction takes a while, so run it only when you have the time.

dataset_Sentiment = SentimentFeatureExtraction(
    dataset,
    'statement',
).GetDataFrame()

# Save the sentiment score features to Drive as CSV, without the row index.
dataset_Sentiment.to_csv(
    '/content/drive/MyDrive/BTP/Files/dataset_Sentiment.csv',
    index=False,
    encoding='utf-8',
)