<a href="https://colab.research.google.com/github/Sereniiti/models-exploration/blob/develop/Ayman/04_sereniiti_with_function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [67]:
# import pandas
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Data_sentences.csv')
# Dataset is now stored in a Pandas Dataframe

In [68]:
# check if any null values in words_en or ratings
print('words_en null records: ', df['words_en'].isnull().sum())
print('ratings null records: ', df['ratings'].isnull().sum())
#if there is any Null Value we can replace with 0
# df['words_en'].fillna(0)

words_en null records:  0
ratings null records:  0


In [69]:
#check the unique records in ratings column to map them with code number
df['ratings'].unique()

array(['bad', 'good', 'very violent', 'violent', 'excellent',
       'acceptable positive', 'acceptable negative'], dtype=object)

In [70]:
# map the ratings to code where 0 'violent' and 1 'good'
df['ratings_code']=df['ratings'].map({'very violent':0,'violent':0,'bad':0,'acceptable negative':1,'acceptable positive':1,'good':1,'excellent':1})

In [71]:
# save the needed columns only in the dataset
df=df[['words_en','ratings_code']]

In [72]:
import sys
sys.path.insert(0,'/content/drive/MyDrive/Colab_Notebooks')

In [74]:
import cleaning_func

In [75]:
from cleaning_func import clean_text_punc

In [76]:
# create new column with to save cleaned text
df['text_clean']=df['words_en'].apply(lambda x : clean_text_punc(x))

In [77]:
from cleaning_func import remove_stopwords

In [78]:
# create new column for stopword cleaned text (not recommended in our case so will ignore that column)
df['text_clean_nostopword']=df['text_clean'].apply(lambda x:remove_stopwords(x))

In [50]:
df.head()

Unnamed: 0,words_en,ratings_code,text_clean,text_clean_nostopword
0,I am being abused.,1,I am being abused,I abused
1,I am unwanted.,1,I am unwanted,I unwanted
2,I don't feel heard,1,I dont feel heard,I dont feel heard
3,I don't feel supported,1,I dont feel supported,I dont feel supported
4,I don't like you,1,I dont like you,I dont like you


In [79]:
from cleaning_func import lemma_data

In [80]:
# create new column for lemmatized cleaned text
df['text_clean_lemma']=df['text_clean'].apply(lambda x:lemma_data(x))

In [81]:
df.head()

Unnamed: 0,words_en,ratings_code,text_clean,text_clean_nostopword,text_clean_lemma
0,I feel neglected by you.,0,I feel neglected by you,I feel neglected you,I feel neglected by you
1,A big boy doesn't cry,0,A big boy doesnt cry,A big boy doesnt cry,A big boy doesnt cry
2,"A cheetah, the fastest land animal, can run 70...",1,A cheetah the fastest land animal can run 70 m...,A cheetah fastest land animal run 70 miles hour,A cheetah the fastest land animal can run 70 m...
3,A combination of words that makes a complete s...,1,A combination of words that makes a complete s...,A combination words makes complete sense calle...,A combination of word that make a complete sen...
4,A door would have more charisma than you,0,A door would have more charisma than you,A door would charisma you,A door would have more charisma than you


In [82]:
# vectorization encode a text as integers to create feature vectores ( vectore of numerical feature that represent an object)
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# create instance from CountVectorizer
text_vectorized=CountVectorizer()

In [83]:
# save the CountVectorized data in word_vectorized, and rating_code values in y
word_vectorized= text_vectorized.fit_transform(df['text_clean'])
y = df['ratings_code'].values
print ('Word_vectorized data shape:\n ', word_vectorized.shape,'\n')

Word_vectorized data shape:
  (7480, 5092) 



In [84]:
from sklearn.model_selection import train_test_split # to split our dtata set into training data and test data

In [85]:
# define the training and test sets, 0.2 means test data is %20 of the trained data will stored in the X_test
X_training, X_testing, y_train, y_test = train_test_split(word_vectorized, y, test_size = 0.25, random_state=1000) 

In [86]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression # 
from sklearn.metrics import accuracy_score #


# adding training and testing set defining the LinearSVC instance firest to fit the training set in
model_lr = LogisticRegression()
# fit x and y training set in the model
model_lr.fit(X_training, y_train)

X_train_prediction=model_lr.predict(X_training)
training_data_accuracy=accuracy_score(X_train_prediction, y_train)

X_test_prediction=model_lr.predict(X_testing)
test_data_accuracy=accuracy_score(X_test_prediction, y_test)

print('CountVectorizer - Accuracy score of the training: ',round(training_data_accuracy,3) )
print('CountVectorizer - Accuracy score of the test : ',round(test_data_accuracy,3) )

CountVectorizer - Accuracy score of the training:  0.947
CountVectorizer - Accuracy score of the test :  0.878


In [87]:
from sklearn.metrics import confusion_matrix
confusion_matrix(X_test_prediction, y_test)

array([[749, 111],
       [117, 893]])

In [88]:
df['text_clean_lemma'].value_counts()

Youre a pain in the as                                                                                                 5
bitch                                                                                                                  4
I feel neglected                                                                                                       4
Good morning                                                                                                           4
You deserve to be slapped                                                                                              4
                                                                                                                      ..
You always have to take detour                                                                                         1
Im afraid that I cant make tomorrow meeting                                                                            1
Im really interested in working 

In [89]:
# sorting by first name
df.sort_values("text_clean_lemma", inplace = True)
df.drop_duplicates(subset='text_clean_lemma', keep='first', inplace=True)

In [90]:
df['text_clean_lemma'].value_counts()

You are so psychotic                                         1
If you had the gut youd do this                              1
You have no capacity for abstraction                         1
You cant keep a straight face                                1
We dont do anything together except for the kid              1
                                                            ..
Whats with the outfit                                        1
I feel frustrated because you did this                       1
I didnt sleep last night I feel inert because I need rest    1
You never follow good practice                               1
I feel suprised                                              1
Name: text_clean_lemma, Length: 7300, dtype: int64

In [91]:
confusion_matrix(X_test_prediction, y_test)

array([[749, 111],
       [117, 893]])

In [92]:
df['ratings_code'].value_counts()

1    3964
0    3336
Name: ratings_code, dtype: int64

In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect=TfidfVectorizer()

Xtf_training = tfidf_vect.fit_transform(df['text_clean'])
y = df['ratings_code'].values

from sklearn.model_selection import train_test_split # to split our dtata set into training data and test data
# define the training and test sets, 0.2 means test data is %20 of the trained data will stored in the X_test
Xt_training, Xt_testing, y_train, y_test = train_test_split(Xtf_training, y, test_size = 0.25, random_state=1000) 

In [94]:
# LogisticRegression for tfidf vectorization
from sklearn.linear_model import LogisticRegression # 
from sklearn.metrics import accuracy_score #
# adding training and testing set defining the LinearSVC instance firest to fit the training set in
model_lr_tf = LogisticRegression()
# fit x and y training set in the model
model_lr_tf.fit(Xt_training, y_train)

Xt_train_prediction=model_lr_tf.predict(Xt_training)
training_data_accuracy_t=accuracy_score(Xt_train_prediction, y_train)

Xt_test_prediction=model_lr_tf.predict(Xt_testing)
test_data_accuracy_t=accuracy_score(Xt_test_prediction, y_test)

print('TfidfVectorizer - Accuracy score of the training:',round(training_data_accuracy_t,3) )
print('TfidfVectorizer - Accuracy score of the test:',round(test_data_accuracy_t,3) )

TfidfVectorizer - Accuracy score of the training: 0.924
TfidfVectorizer - Accuracy score of the test: 0.884


In [96]:
# data entry test model
#========================
#input the data for testing the model
X_new = [input('please input your text to test : ')]
# save the input data in a new data frame
X_new_data=pd.DataFrame({0:X_new})

#vectorize the inputs and create the training set to use the input data to test from
test_vectorized=CountVectorizer()
training_features = test_vectorized.fit_transform(df['text_clean'])    
test_features = test_vectorized.transform(df['text_clean'])
model_test = LogisticRegression()
model_test.fit(training_features, df['ratings_code'])
y_pred = model_test.predict(test_features)

#predict the vectorized inputs
X_new_test = test_vectorized.transform(X_new_data[0])  
prediction=model_test.predict(X_new_test)
print(prediction)

please input your text to test : this is bad
[0]


In [97]:
from cleaning_func import predict
predict(prediction)

violent
