In [1]:
! unzip files

Archive:  files.zip
   creating: dataset/
  inflating: dataset/train.csv       
  inflating: dataset/test.csv        


In [8]:
# Imported here as well: this cell sits above the notebook's import cell, so on
# a fresh kernel run top-to-bottom `nltk` would otherwise be undefined.
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# Imported here as well: this cell sits above the notebook's import cell, so on
# a fresh kernel run top-to-bottom `nltk` would otherwise be undefined.
import nltk

# 'punkt' backs word_tokenize; 'words' backs nltk.corpus.words.
nltk.download('punkt')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [5]:
!pip install langdetect

Collecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/56/a3/8407c1e62d5980188b4acc45ef3d94b933d14a2ebc9ef3505f22cf772570/langdetect-1.0.8.tar.gz (981kB)
[K     |████████████████████████████████| 983kB 2.8MB/s 
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.8-cp36-none-any.whl size=993193 sha256=ecf6db77f61642020691c9e4c14cf1bd590d4f1aef71015a678e63613b384302
  Stored in directory: /root/.cache/pip/wheels/8d/b3/aa/6d99de9f3841d7d3d40a60ea06e6d669e8e5012e6c8b947a57
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.8


In [6]:
! pip install googletrans

Collecting googletrans
  Downloading https://files.pythonhosted.org/packages/fd/f0/a22d41d3846d1f46a4f20086141e0428ccc9c6d644aacbfd30990cf46886/googletrans-2.4.0.tar.gz
Building wheels for collected packages: googletrans
  Building wheel for googletrans (setup.py) ... [?25l[?25hdone
  Created wheel for googletrans: filename=googletrans-2.4.0-cp36-none-any.whl size=15777 sha256=3a4cf385bf622fa96320f7416ced079f1a35b4e6f4bee4e769a3794491848de7
  Stored in directory: /root/.cache/pip/wheels/50/d6/e7/a8efd5f2427d5eb258070048718fa56ee5ac57fd6f53505f95
Successfully built googletrans
Installing collected packages: googletrans
Successfully installed googletrans-2.4.0


In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from googletrans import Translator
from langdetect import detect 
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.externals import joblib


In [0]:
class PreprocessData():
    """Load a tweet CSV and normalise its ``original_text`` column.

    The cleaning pipeline lower-cases the text, expands a few contractions,
    drops hashtags, @-mentions and links, strips punctuation, removes English
    stop words, stems the remaining tokens with the Porter stemmer and keeps
    only purely alphabetic stems.

    Requires the NLTK ``stopwords``, ``punkt`` and ``words`` resources.
    """

    # (regex, replacement) pairs applied to the lower-cased text before
    # punctuation is stripped. Both the typographic (’) and the plain (')
    # apostrophe are accepted.
    _CONTRACTIONS = (
        (r"i[’']m", 'i am'),
        (r"doesn[’']t", 'does not'),
        (r"don[’']t", 'do not'),
        (r"mother[’']s", 'mothers'),
        (r"it[’']s", 'it is'),
        (r"can[’']t", 'can not'),
        (r"mum[’']s", 'mums'),
    )

    def __init__(self,csv_file):
        """Read ``csv_file`` and build the NLTK helpers used for cleaning.

        Args:
            csv_file: Anything ``pandas.read_csv`` accepts; the resulting frame
                is expected to have an ``original_text`` column.
        """
        self.dataset = pd.read_csv(csv_file)
        self.convert_baseform = PorterStemmer()
        # Not read by any method of this class; kept as a public attribute in
        # case code outside this class relies on it.
        self.english_word = set(nltk.corpus.words.words())
        # Loaded once: rebuilding the stop-word list for every token made the
        # cleaning needlessly slow.
        self.stop_words = frozenset(stopwords.words('english'))

    def _expand_contractions(self, text):
        """Replace the contractions listed in ``_CONTRACTIONS`` with their long form."""
        for pattern, replacement in self._CONTRACTIONS:
            text = re.sub(pattern, replacement, text)
        return text

    def _drop_tags_and_mentions(self, text):
        """Remove space-separated tokens that start with ``#`` or ``@``.

        ``startswith`` is used instead of indexing so the empty tokens produced
        by consecutive spaces no longer raise ``IndexError``.
        """
        return ' '.join(tok for tok in text.split(' ') if not tok.startswith(('#', '@')))

    def _strip_links(self, text):
        """Remove ``pic.twitter.com`` links and cut the text at the first URL.

        Everything from the first ``http`` (which also covers ``https``) onward
        is discarded. If that would leave nothing, because the text starts with
        the link, the text is returned as-is rather than emptied.
        """
        text = re.sub(r'\spic\.twitter\.com/\w+', '', text)
        cut = text.find('http')
        # str.find returns -1 when the substring is absent, so compare
        # explicitly instead of relying on truthiness.
        if cut == -1:
            return text
        head = text[:cut].strip(' ')
        return head if head else text

    def preprocess_string(self,data):
        """Return the cleaned, stemmed form of one tweet.

        Args:
            data: Raw tweet text. Non-string values (for example the NaN that
                pandas uses for an empty cell) yield an empty string.

        Returns:
            Space-joined alphabetic stems. If cleaning fails unexpectedly,
            "Error in format" is printed and the lower-cased input is returned.
        """
        if not isinstance(data, str):
            return ""

        lowered = data.lower()
        try:
            tweet = self._expand_contractions(lowered)
            tweet = self._drop_tags_and_mentions(tweet)
            tweet = self._strip_links(tweet)
            without_punctuation = re.sub(r'[^\w\s]','',tweet)

            tokens = [tok for tok in word_tokenize(without_punctuation)
                      if tok not in self.stop_words]
            stems = [self.convert_baseform.stem(tok) for tok in tokens]
            # \w keeps digits and underscores, so filter to alphabetic stems.
            return ' '.join(stem for stem in stems if stem.isalpha())
        except Exception:
            # Best effort: keep the row rather than abort the whole column.
            print("Error in format")
            return lowered

    def clean_dataset(self):
        """Replace ``original_text`` in the loaded frame with its cleaned form."""
        self.dataset['original_text'] = self.dataset['original_text'].apply(self.preprocess_string)

    def getDataset(self):
        """Return the (possibly cleaned) DataFrame held by this instance."""
        return self.dataset
    
    def save_preprocessed_data_set(self):
        """Write the current frame to ``cleaned_comment.csv`` without the index."""
        self.dataset.to_csv('cleaned_comment.csv',index = False)


In [0]:
class SentimentModel():
    """TF-IDF + classifier pipeline for tweet sentiment.

    Trains on a cleaned CSV (columns ``original_text`` and ``sentiment_class``)
    and predicts labels for a raw test CSV (columns ``id`` and
    ``original_text``), which is cleaned with ``PreprocessData`` first.
    """

    def __init__(self,dataset,testdata):
        """Load the training data and set up the vectorizer and model.

        Args:
            dataset: Path of the cleaned training CSV.
            testdata: Path of the raw test CSV; only read by ``prepare_result``.
        """
        self.test_dataset = testdata

        self.dataset = pd.read_csv(dataset)
        # Label -1 is stored as 2 while training and mapped back in
        # prepare_result (presumably to keep labels non-negative - TODO confirm).
        self.dataset['sentiment_class'] = self.dataset['sentiment_class'].apply(self.convert_neg_to_pos)
        # A tweet cleaned down to an empty string is written as an empty CSV
        # field and read back as NaN, which TfidfVectorizer rejects.
        self.dataset['original_text'] = self._as_text(self.dataset['original_text'])
        self.text = self.dataset['original_text'].values
        self.label = self.dataset['sentiment_class'].values
        self.x_train,self.y_train = None,None
        self.x_test,self.y_test = None,None
        self.vectorizer = TfidfVectorizer()
        # Candidate estimator classes; only self.model below is used by the
        # methods of this class.
        self.mlModels = {'GaussianNB': GaussianNB,
                         'MultinomialNB': MultinomialNB,
                         'SVC': SVC,
                         'KNeighborsClassifier': KNeighborsClassifier,
                         'DecisionTreeClassifier': DecisionTreeClassifier,
                         'RandomForestClassifier': RandomForestClassifier}
        self.model = MultinomialNB()

    @staticmethod
    def _as_text(column):
        """Return ``column`` with missing values replaced by '' and all values as str."""
        return column.fillna('').astype(str)

    def convert_neg_to_pos(self,n):
        """Map label -1 to 2; every other label is returned unchanged."""
        if(n== -1):
            return 2
        return n
    
    def convert_pos_to_neg(self,n):
        """Inverse of ``convert_neg_to_pos``: map label 2 back to -1."""
        if(n==2):
            return -1
        return n

    def split_data(self):
        """Split text and labels 90/10 into train and test with a fixed seed."""
        self.x_train,self.x_test,self.y_train,self.y_test = train_test_split(self.text,self.label, test_size =0.1, random_state=42)
    
    def vectorize_data(self):
        """Split the data, fit TF-IDF on the training part and transform both parts."""
        self.split_data()
        self.x_train = self.vectorizer.fit_transform(self.x_train)
        self.x_test = self.vectorizer.transform(self.x_test)
    
    def train_model(self):
        """Fit ``self.model`` on the dense TF-IDF training matrix.

        ``vectorize_data`` must have been called first.
        """
        self.model.fit(self.x_train.toarray(), self.y_train)

    def test_accuratcy(self):
        """Score the model on the held-out split, save it, and report accuracy.

        Side effect: the fitted model is written to ``model.pkl``.

        Returns:
            The accuracy on the held-out split (it is also printed).
        """
        y_true, y_pred = self.y_test, self.model.predict(self.x_test.toarray())
        joblib.dump(self.model, 'model.pkl') 
        score = accuracy_score(y_true,y_pred)
        print(score)
        return score

    def test_accuracy(self):
        """Correctly spelled alias of ``test_accuratcy``."""
        return self.test_accuratcy()
    
    def load_model(self):
        """Replace ``self.model`` with the estimator stored in ``model.pkl``.

        Only the classifier is stored there; the TF-IDF vectorizer is not, so
        ``vectorize_data`` still has to run before ``prepare_result``.
        """
        # NOTE: joblib.load unpickles the file and can execute arbitrary code;
        # only load a model.pkl that this notebook produced.
        self.model = joblib.load('model.pkl')

    def pre_process_data(self):
        """Clean the test CSV with ``PreprocessData`` and return the frame."""
        ppr = PreprocessData(self.test_dataset)
        ppr.clean_dataset()
        return ppr.getDataset()

    def prepare_result(self):
        """Predict a sentiment for every test row.

        Returns:
            DataFrame with columns ``id`` and ``sentiment_class``, where the
            internal label 2 has been mapped back to -1.
        """
        test_df = self.pre_process_data()
        comments = self._as_text(test_df['original_text']).values
        features = self.vectorizer.transform(comments)
        # Dense input, consistent with train_model/test_accuratcy, so
        # estimators that do not accept sparse matrices also work here.
        predictions = self.model.predict(features.toarray())
        return pd.DataFrame({
            'id':test_df['id'].values,
            'sentiment_class': [self.convert_pos_to_neg(label) for label in predictions]
        })

    def getDataset(self):
        """Return the training DataFrame (labels already remapped)."""
        return self.dataset


In [0]:
if __name__ == "__main__":
    # Locations of the raw inputs and of the files this run produces.
    train_csv, test_csv = "dataset/train.csv", "dataset/test.csv"
    cleaned_csv, submission_csv = "cleaned_comment.csv", 'sub.csv'

    # Stage 1: clean the training tweets and persist them to disk.
    data = PreprocessData(train_csv)
    data.clean_dataset()
    data.save_preprocessed_data_set()

    # Stage 2: fit TF-IDF and the classifier on the cleaned training file.
    smtAnly = SentimentModel(cleaned_csv, test_csv)
    smtAnly.vectorize_data()
    smtAnly.train_model()

    # Stage 3: predict the test set and write the submission file.
    val = smtAnly.prepare_result()
    val.to_csv(submission_csv, index=False)