In [1]:
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics

In [2]:
# importing training data
df = pd.read_csv('./datasets/computer_train.csv')
df.columns = ['example_id','text','aspect_term','term_location','class']
df[['text', 'aspect_term']].head()

Unnamed: 0,text,aspect_term
0,Obviously one of the most important features o...,human interface
1,Good for every day computing and web browsing.,every day computing
2,while the keyboard itself is alright[comma] th...,mouse command buttons
3,Again[comma] the same problem[comma] the right...,right speaker
4,My problem was with DELL Customer Service.,DELL Customer Service


In [3]:
df.shape

(2203, 5)

In [4]:
# importing testing data
df_test1 = pd.read_csv('./datasets/computer_test.csv')
df_test1.columns = ['example_id','text','aspect_term','term_location']
df_test1.head()

Unnamed: 0,example_id,text,aspect_term,term_location
0,494:1_0,Buy the separate RAM memory and you will have ...,RAM memory,17--27
1,311:18_1,But with this laptop[comma] the bass is very w...,sound,52--57
2,256:1_0,This hardware seems to be better than the iMac...,hardware,5--13
3,748:1_0,It is very easy to navigate even for a novice.,navigate,19--27
4,128:1_0,Amazing Performance for anything I throw at it.,Performance,8--19


In [5]:
def text_prep(text):
    text = text.replace("[comma]", "")
    no_punct = [char for char in text if char not in string.punctuation]
    text = "".join(no_punct)
    text = [WordNetLemmatizer().lemmatize(text, pos='v') for text in text.lower().split() if text not in set(stopwords.words('english'))] 
    text = " ".join(text)
    return word_tokenize(text)

In [6]:
def get_weights(x):
    text = x[0]
    aspect = x[1]
    if set(aspect) <= set(text):
        left_weights = right_weights = []
        start_list= [i for i, x in enumerate(text) if x == aspect[0]]
        for q in (start_list):
            if  text[(q + len(aspect) - 1)] == aspect[-1]:
                start_index = q
                end_index = q + len(aspect) - 1
                break
        if (end_index - start_index) == len(aspect) - 1:
            left_text = text[:start_index]
            right_text = text[end_index+1:]
            left_weights = [1/i for i in range(len(left_text),0,-1) if len(left_text) != 0]
            right_weights = [1/i for i in range(1,len(right_text)+1) if len(right_text) != 0]
        total_weights = left_weights + [2]*len(aspect) + right_weights
        return dict(zip(text,total_weights))
    else: 
        return np.nan

## Data Preprocessing

In [7]:
# preprocess text and aspect_term columns
df['prep_text'] = df['text'].apply(text_prep)
df['prep_aspect_term'] = df['aspect_term'].apply(text_prep)
df[['prep_text', 'prep_aspect_term']].head()

Unnamed: 0,prep_text,prep_aspect_term
0,"[obviously, one, important, feature, computer,...","[human, interface]"
1,"[good, every, day, compute, web, browse]","[every, day, compute]"
2,"[keyboard, alright, plate, around, cheap, plas...","[mouse, command, button]"
3,"[problem, right, speaker, work]","[right, speaker]"
4,"[problem, dell, customer, service]","[dell, customer, service]"


In [8]:
# preprocessing testing data
df_test1['prep_test1_text'] = df_test1['text'].apply(text_prep)
df_test1['prep_test1_aspect_term'] = df_test1['aspect_term'].apply(text_prep)
df_test1[['prep_test1_text', 'prep_test1_aspect_term']].head()

Unnamed: 0,prep_test1_text,prep_test1_aspect_term
0,"[buy, separate, ram, memory, rocket]","[ram, memory]"
1,"[laptop, bass, weak, sound, come, sound, tinny]",[sound]
2,"[hardware, seem, better, imac, isnt, 1400, sma...",[hardware]
3,"[easy, navigate, even, novice]",[navigate]
4,"[amaze, performance, anything, throw]",[performance]


## Assigning weights

In [9]:
# weighing training data
df['weights_score'] = df[['prep_text','prep_aspect_term']].apply(get_weights, axis = 1)
df = df.dropna()
df.iloc[0]['weights_score']

{'obviously': 0.2,
 'one': 0.25,
 'important': 0.3333333333333333,
 'feature': 0.5,
 'computer': 1.0,
 'human': 2,
 'interface': 2}

In [10]:
# weighing testing data
df_test1['test1_weights_score'] = df_test1[['prep_test1_text','prep_test1_aspect_term']].apply(get_weights, axis = 1)
df_test1 = df_test1.dropna()
# df_test1.iloc[1]['test1_weights_score']
df_test1.head()

Unnamed: 0,example_id,text,aspect_term,term_location,prep_test1_text,prep_test1_aspect_term,test1_weights_score
0,494:1_0,Buy the separate RAM memory and you will have ...,RAM memory,17--27,"[buy, separate, ram, memory, rocket]","[ram, memory]","{'buy': 0.5, 'separate': 1.0, 'ram': 2, 'memor..."
1,311:18_1,But with this laptop[comma] the bass is very w...,sound,52--57,"[laptop, bass, weak, sound, come, sound, tinny]",[sound],"{'laptop': 0.3333333333333333, 'bass': 0.5, 'w..."
2,256:1_0,This hardware seems to be better than the iMac...,hardware,5--13,"[hardware, seem, better, imac, isnt, 1400, sma...",[hardware],"{'hardware': 2, 'seem': 1.0, 'better': 0.5, 'i..."
3,748:1_0,It is very easy to navigate even for a novice.,navigate,19--27,"[easy, navigate, even, novice]",[navigate],"{'easy': 1.0, 'navigate': 2, 'even': 1.0, 'nov..."
4,128:1_0,Amazing Performance for anything I throw at it.,Performance,8--19,"[amaze, performance, anything, throw]",[performance],"{'amaze': 1.0, 'performance': 2, 'anything': 1..."


## Vectorizing data using CountVectorizer and Tf-idf Transformer

In [11]:
# training data vectorized
vec = CountVectorizer()
doc_matrix = vec.fit_transform(df['text'])
vocab = list(vec.vocabulary_.keys())
df_weights = pd.DataFrame(np.zeros((len(df),len(vocab))),columns=vocab)
for row in range(len(df)):
    for key,value in df.iloc[row]['weights_score'].items():
        df_weights.iloc[row][key] = value
tfidf_vec = TfidfTransformer()
tfidf = tfidf_vec.fit_transform(df_weights)

In [12]:
# testing data vectorized
test1_doc_matrix = vec.transform(df_test1['text'])
test1_vocab = list(vec.vocabulary_.keys())
df_test1_weights = pd.DataFrame(np.zeros((len(df_test1),len(test1_vocab))),columns=test1_vocab)
for row in range(len(df_test1)):
    for key,value in df_test1.iloc[row]['test1_weights_score'].items():
        df_test1_weights.iloc[row][key] = value

test1_tfidf = tfidf_vec.transform(df_test1_weights)

## Random Forest

In [13]:
# Choosing Random Forest Classifier as our final model
rfc = RandomForestClassifier(n_estimators=50,max_depth=2000, n_jobs = -1)
crv_rfc = cross_val_predict(rfc,tfidf,df['class'],cv = 10)
np.mean(crv_rfc == df['class'])

0.7147435897435898

In [14]:
Y = df['class']
print("\n Classification Report \n ", classification_report(crv_rfc,Y))
rfc = RandomForestClassifier(n_estimators=50,max_depth=2000).fit(tfidf, df['class'])
test_rfc = rfc.predict(test1_tfidf)


 Classification Report 
                precision    recall  f1-score   support

          -1       0.81      0.67      0.73       999
           0       0.36      0.68      0.47       228
           1       0.79      0.77      0.78       957

    accuracy                           0.71      2184
   macro avg       0.66      0.71      0.66      2184
weighted avg       0.76      0.71      0.73      2184



# AdaBoost

In [None]:
boost = AdaBoostClassifier(n_estimators=100)
crv_boost = cross_val_predict(boost,tfidf,df['class'],cv = 10)
np.mean(crv_boost == df['class'])

In [None]:
Y = df['class']
print("\n Classification Report \n ", classification_report(crv_boost,Y))
boost = AdaBoostClassifier(n_estimators=100).fit(tfidf, df['class'])
test_boost = boost.predict(test1_tfidf)

## Output file

In [None]:
# Please remember to delete the txt file created before otherwise it will append to the previous results

with open('Result.txt', 'a') as out:
    for i,j in zip(df_test1['example_id'],test_rfc):
        out.write(str(i) + ";;" + str(j) + "\n")

## Linear Support Vector Machine

In [None]:
svm = LinearSVC()
crv_svm = cross_val_predict(svm,tfidf,df['class'],cv = 10)
np.mean(crv_svm == df['class'])


In [None]:
Y = df['class']
print("\n Classification Report \n ", classification_report(crv_svm,Y))

## Multinomial Naive Bayes

In [None]:
mnb = MultinomialNB()
crv_mnb = cross_val_predict(mnb,tfidf,df['class'],cv = 10)
np.mean(crv_mnb == df['class'])


In [None]:
Y = df['class']
print("\n Classification Report \n ", classification_report(crv_mnb,Y))