In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import preprocess
import numpy as np
import verb_extraction
import stemming
import re


In [20]:
# Load the MOV dataset, then eyeball its label values and column names.
data0 = pd.read_csv('new/MOV.csv', encoding='utf-8')
print(data0['polarity'].unique())
data0.columns

[ 1 -1  0]


Index(['text', 'polarity'], dtype='object')

In [21]:
# Load the ATT dataset and discard its 'Unnamed: 0' column in one chained step.
data1 = pd.read_csv('new/ATT.csv', encoding='utf-8').drop(columns=['Unnamed: 0'])
print(data1['polarity'].unique())
data1.columns

[ 1. -1.]


Index(['text', 'polarity'], dtype='object')

In [22]:
# Load the HTL dataset, then eyeball its label values and column names.
data2 = pd.read_csv('new/HTL.csv', encoding='utf-8')
print(data2['polarity'].unique())
data2.columns

[ 1  0 -1]


Index(['text', 'polarity'], dtype='object')

In [23]:
# Load the PROD dataset, then eyeball its label values and column names.
data3 = pd.read_csv('new/PROD.csv', encoding='utf-8')
print(data3['polarity'].unique())
data3.columns

[ 1  0 -1]


Index(['text', 'polarity'], dtype='object')

In [24]:
# Load the RES dataset, then eyeball its label values and column names.
data4 = pd.read_csv('new/RES.csv', encoding='utf-8')
print(data4['polarity'].unique())
data4.columns

[ 1  0 -1]


Index(['text', 'polarity'], dtype='object')

In [25]:
# Load the RES1 dataset, drop the id columns, and keep only (text, polarity)
# in that order so the frame lines up with the other datasets.
data5 = (
    pd.read_csv('new/RES1.csv', encoding='utf-8')
    .drop(columns=['restaurant_id', 'user_id'])
    .loc[:, ['text', 'polarity']]
)
print(data5['polarity'].unique())
data5.columns

[-1  1]


Index(['text', 'polarity'], dtype='object')

In [26]:
# Load the RES2 dataset, then eyeball its label values and column names.
data6 = pd.read_csv('new/RES2.csv', encoding='utf-8')
print(data6['polarity'].unique())
data6.columns

[ 1  0 -1]


Index(['text', 'polarity'], dtype='object')

In [27]:
# Stack all seven datasets into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it every source frame contributes its own 0-based
# labels and the combined index contains duplicates.
data = pd.concat([data0, data1, data2, data3, data4, data5, data6], ignore_index=True)

In [28]:
# Expose the review text under the name 'TWEET' (appended as the last column)
# and remove the original 'text' column.
data = data.assign(TWEET=data['text']).drop(columns=['text'])

In [29]:
def change(polar):
    """Translate a numeric polarity into a sentiment tag.

    Returns 'pos' when ``polar`` equals 1, 'neg' when it equals -1, and
    'neutral' for every other value.
    """
    if polar == 1:
        return 'pos'
    return 'neg' if polar == -1 else 'neutral'

In [30]:
# Derive the string sentiment tag from the numeric polarity, add a constant
# placeholder LABEL column, and drop the numeric column that is no longer needed.
data = data.assign(
    pos_neg=data['polarity'].apply(change),
    LABEL='none',
).drop(columns=['polarity'])

In [31]:
# Keep only rows tagged 'pos' or 'neg'; neutral rows are excluded.
data = data[data['pos_neg'].ne('neutral')]

In [32]:
# Sanity check: show which sentiment tags remain after the neutral rows were removed.
data['pos_neg'].unique()

array(['pos', 'neg'], dtype=object)

In [33]:
# Number of rows left after filtering.
len(data)

42339

In [34]:
# Read the whitespace-separated stop-word list. The context manager closes the
# handle even if reading raises, which the manual open()/close() pair did not.
with open("../../Data/stopwords.txt", "r", encoding="utf-8") as file:
    stopwords = file.read().split()
def get_tokens(text):
    """Split ``text`` on whitespace, expand abbreviations, and drop stop words.

    Each token found in the abbreviation table is replaced by its expansion
    first; the stop-word filter (module-level ``stopwords``) is then applied to
    the expanded tokens. Returns the surviving tokens as a list, in order.
    """
    dictionary = {'د':"دكتور"}#to be continued
    expanded = [dictionary.get(token, token) for token in text.split()]
    return [token for token in expanded if token not in stopwords]

In [35]:
# Pull the three columns out as plain Python lists for element-wise processing.
tweet, label, pos_neg = (list(data[column]) for column in ('TWEET', 'LABEL', 'pos_neg'))

In [36]:
import numpy as np
import verb_extraction
import stemming
import re

# Normalise every tweet in place: project pre-processing -> tokenisation and
# stop-word removal -> verb extraction -> stemming. The stemmer's result is
# iterated as pairs and only the first item of each pair is kept, joined by
# single spaces.
for idx, raw_tweet in enumerate(tweet):
    cleaned = preprocess.pre_process(str(raw_tweet))  # str() guards non-string cells
    tokens = get_tokens(cleaned)
    extracted = verb_extraction.extract_stem_verb(tokens, {})
    stemmed_pairs = np.array(stemming.stem(extracted))
    tweet[idx] = ' '.join(str(stem) for stem, _ in stemmed_pairs)


In [37]:
def _clean_tweet(text):
    """Strip links, Latin letters, newlines, digits and surplus spaces from ``text``.

    Links are removed first: the pattern starts with the literal ``http``, so
    if Latin letters were blanked beforehand it would never match and URL
    punctuation such as ``://`` and ``.`` would be left in the text.
    """
    text = re.sub(r'http\S+', '', text)    # remove links (must precede letter removal)
    text = re.sub("[a-zA-Z]", " ", text)   # remove english letters
    text = re.sub('\n', ' ', text)         # remove \n from text
    text = re.sub(r'\d+', '', text)        # remove number
    text = re.sub(' +', ' ', text)         # collapse runs of spaces
    return text.strip()                    # trim leading/trailing whitespace


for i in range(len(tweet)):
    tweet[i] = _clean_tweet(tweet[i])


In [38]:
# Discard tweets that became empty after cleaning, keeping the label lists
# aligned with the surviving texts, then rebuild the frame from the lists.
kept_rows = [
    (body, tag, sentiment)
    for body, tag, sentiment in zip(tweet, label, pos_neg)
    if body != ''
]
text = [row[0] for row in kept_rows]
target = [row[1] for row in kept_rows]
posneg = [row[2] for row in kept_rows]
data = pd.DataFrame({'TWEET':text,'LABEL':target,'pos_neg':posneg})


In [44]:
# Persist the cleaned dataset; index=False keeps the row index out of the CSV.
data.to_csv('new/final.csv',index=False)

In [45]:
# Confirm the column layout of the frame that was written out.
data.columns

Index(['TWEET', 'LABEL', 'pos_neg'], dtype='object')

In [40]:
# TF-IDF over 1- to 3-grams; terms present in more than 85% of documents or in
# fewer than 2 documents are ignored.
vectorizer = TfidfVectorizer(encoding='utf-8', ngram_range=(1, 3), max_df=0.85, min_df=2)

# 80/20 split with a fixed seed. The vocabulary is fitted on the training texts
# only and then reused to transform the held-out texts.
X_train, X_test, y_train, y_test = train_test_split(
    text, posneg, test_size=0.2, random_state=0
)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# Linear SVM with C=0.2 and a fixed seed.
# Alternative left for reference: RandomForestClassifier(n_jobs=2, random_state=0)
text_classifier = LinearSVC(C=0.2, random_state=0)
text_classifier.fit(X_train, y_train)

LinearSVC(C=0.2, random_state=0)

In [42]:
from sklearn.metrics import accuracy_score

# Score the fitted classifier on the held-out 20% split.
predictions = text_classifier.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.9299249195566679
