<a href="https://colab.research.google.com/github/Sereniiti/models-exploration/blob/develop/Ayman/01_Sereniiti_LogisticRegression_0_or_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd # for data frame
import re # regular expression library for searching words in a text or paragraph
from nltk.stem.porter import PorterStemmer # stemming function takes the word and remove prefex and suffix of that word and returns the root word
from sklearn.feature_extraction.text import TfidfVectorizer #to convert the words to feature  vectors
from sklearn.model_selection import train_test_split # to split our dtata set into training data and test data
from sklearn.linear_model import LogisticRegression # 
from sklearn.metrics import accuracy_score #


In [2]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [4]:
data_sentences = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data_sentences.csv')
# Dataset is now stored in a Pandas Dataframe

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# show first 5 records
data_sentences.head(5)

Unnamed: 0,words_en,words_Fr,ratings
0,I am being abused.,Je suis maltraité.,acceptable negative
1,I am unwanted.,Je suis indésirable.,acceptable negative
2,I don't feel heard,Je ne me sens pas entendu,acceptable negative
3,I don't feel supported,Je ne me sens pas soutenue,acceptable negative
4,I don't like you,Je ne t'aime pas,acceptable negative


In [6]:
data_sentences.shape

(7482, 3)

In [7]:
data_sentences.tail(5)

Unnamed: 0,words_en,words_Fr,ratings
7477,Thank you for walking through this life with m...,Merci d'avoir traversé cette vie avec moi. Je ...,good
7478,Thank you.,Merci.,good
7479,Thanks for your help.,Merci pour votre aide.,good
7480,That is hardly important,Ce n'est pas important.,bad
7481,"You are delusional, absolutely nothing like th...","Tu délire, absoluement rien de tout cela n'est...",violent


In [8]:
#check missing values
data_sentences.isnull().sum()

words_en    0
words_Fr    0
ratings     0
dtype: int64

In [9]:
#check the unique records
data_sentences['ratings'].unique()

array(['acceptable negative', 'acceptable positive', 'bad', 'violent',
       'very violent', 'good', 'excellent'], dtype=object)

In [11]:
# map the ratings to code where 0 'very violent' until 6 'excellent'
data_sentences['ratings_code']=data_sentences['ratings'].map({'very violent':0,'violent':0,'bad':0,'acceptable negative':1,'acceptable positive':1,'good':1,'excellent':1})

In [12]:
data_sentences.head()

Unnamed: 0,words_en,words_Fr,ratings,ratings_code
0,I am being abused.,Je suis maltraité.,acceptable negative,1
1,I am unwanted.,Je suis indésirable.,acceptable negative,1
2,I don't feel heard,Je ne me sens pas entendu,acceptable negative,1
3,I don't feel supported,Je ne me sens pas soutenue,acceptable negative,1
4,I don't like you,Je ne t'aime pas,acceptable negative,1


In [13]:
# separating rating storing Scentinces in Y and rating in X
X=data_sentences['words_en']
Y=data_sentences['ratings_code']

In [14]:
Y.unique()

array([1, 0])

In [15]:
# stemming is the process of reducing a word to Root word
port_stem = PorterStemmer()

In [16]:
# create stemming function with input variable 
def stemming(content):
    stemmed_content=re.sub('[^a-zA-Z]',' ',content) # calling reguler expression searching for only using^ (from a to z and from A to Z) execluding others like numbers, especial characters
    stemmed_content=stemmed_content.lower() # convert all to lower case
    stemmed_content=stemmed_content.split() # split and convert to a list
#    stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')] # taking each word removing the stop words using the for loop
    stemmed_content=' '.join(stemmed_content) # joining all the words after stemming
    return stemmed_content # return the result

In [17]:
data_sentences['words_stemmed']=data_sentences['words_en'].apply(stemming)

In [18]:
X = data_sentences['words_stemmed'].values
Y = data_sentences['ratings_code'].values

In [19]:
# convert the textual data to numerical data vectors
vectorizer = TfidfVectorizer() # Tf for term frequency and idf transfer inverse document frequency , basicaly counts the number of times a particular word is repeating and assign a numerical value  
vectorizer.fit(X)
X = vectorizer.transform(X)

In [None]:
X.shape

(7482, 5026)

In [None]:
Y.shape

(7482,)

In [20]:
#splitting the dataset to training & test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2) # 0.2 means test data is %20 of the trained data will stored in the X_test

In [None]:
X_train.shape

(5985, 5026)

In [None]:
Y_train.shape

(5985,)

In [None]:
X_test.shape

(1497, 5026)

In [21]:
model=LogisticRegression()

In [22]:
model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
X_train_prediction=model.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction, Y_train)

In [24]:
print(' Accuracy score of the training:',training_data_accuracy )

 Accuracy score of the training: 0.9191311612364244


In [25]:
X_test_prediction=model.predict(X_test)
test_data_accuracy=accuracy_score(X_test_prediction, Y_test)

In [26]:
print(' Accuracy score of the test:',test_data_accuracy )

 Accuracy score of the test: 0.8851035404141616


In [27]:
# test any record in the test set
X_new = X_test[5]
prediction=model.predict(X_new)
print(prediction)

[1]


In [31]:

X_new = [input('please input your text to test : ')]
X_new_data=pd.DataFrame({0:X_new},index={0})
X_new_stemming=X_new_data[0].apply(stemming)
X_new_in = vectorizer.transform(X_new_stemming)
prediction=model.predict(X_new_in)
print(prediction)

#'very violent':0,'violent':1,'bad':2,'acceptable negative':3,'acceptable positive':4,'good':5,'excellent':6
if (prediction[0]==0):
    print('bad')
elif (prediction[0]==1):
    print('good')
else:
    print('unable to predict')

please input your text to test : this is too bad
[0]
bad
