# Importing the dataset into a dataframe

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk

import seaborn as sns

from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
import sklearn.model_selection as ms
import sklearn.preprocessing as preprocess
from sklearn.model_selection import train_test_split
import pickle
from sklearn.metrics import classification_report
    


In [2]:
# !pip install pyarabic
# !pip install langdetect
# !pip install nltk

# from langdetect import detect
# import pyarabic.araby as araby
# nltk.download("all")


In [3]:
dataset_folder_path="/home/youssef/AUC/Spring22/CSCE493002 - Machine Learning/project/datasets"
df = pd.read_csv(dataset_folder_path+'/cleanedText.csv')
df.head()

Unnamed: 0,label,text
0,1,متز نوع ما نظف وقع جهز شاطيء طعم
1,1,احد سبب نجح امر كل شخص هذه دول عشق ترب نحن نحب...
2,1,هدف نقل صخب شرع قهر هدء جبل شيش عرف حقق ما جرى...
3,1,خلص بدء الل بهر زي فيل زرق حمد راد خطى رحل قرء...
4,1,ياس جزء لا دبي ندق كامل خدم ريح نفس وجد


In [4]:
df_copy=df.copy()

In [5]:
df_copy["text"]=df_copy['text'].values.astype('U')
df

Unnamed: 0,label,text
0,1,متز نوع ما نظف وقع جهز شاطيء طعم
1,1,احد سبب نجح امر كل شخص هذه دول عشق ترب نحن نحب...
2,1,هدف نقل صخب شرع قهر هدء جبل شيش عرف حقق ما جرى...
3,1,خلص بدء الل بهر زي فيل زرق حمد راد خطى رحل قرء...
4,1,ياس جزء لا دبي ندق كامل خدم ريح نفس وجد
...,...,...
66661,0,عرفش ليه كنت كمل وهي مش عجب حدث بطء ممل روي اط...
66662,0,لا سحق يكون كنق لنه سيء شي وجد خدم فطر صبح ستي...
66663,0,ضعف جدا ولم متع به كل قصه سرد لحل شهد بدن فكر
66664,0,ملة جدا حمد حسن علو فنن وصف عند دقق حد ثني قرء...


# Shuffle

In [6]:
#shuffling
from  sklearn.utils import shuffle
df_shuffled=shuffle(df_copy,random_state=0)
df_shuffled

Unnamed: 0,label,text
64069,0,كبس كان غير نظف رة، شرشف سرر ستر حمم ليء اليوم...
46223,0,رغم عجب وسف زيد بدع ءرخ قدر لكن ظل كانت وحد اس...
45496,0,حرم سعر خرج دخل له صعب وقع تعب رهق وجد
39719,0,جمل عين وصف فءة ذكر وفه نصح فيد شكل التي يقع ب...
31607,1,جمل ظرف رغم قلة عدد صفح الا فكر متع دعو ءمل شك...
...,...,...
41993,0,خيب امل سيء لنه ادي فقط
21243,1,جيد وقف سير لم تكن كفي
45891,0,جدد ديو هو ضفة رسم قصد غير تلك مثل رءة لبس ظهر...
42613,0,جنب كان اذا طعت لم يتم وفر نشف وكان علي طلب كل...


In [7]:


#Splitting data
x=df_shuffled['text']
y=np.expand_dims(df_shuffled['label'],axis=1)
X_train,X_test,Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=1)
X_train, X_val, Y_train, Y_val = train_test_split(X_train,Y_train,test_size=0.25,random_state=1)

print("Train: " ,X_train.shape,Y_train.shape,"\nValidation:",X_val.shape,Y_val.shape,"\nTest: ",(X_test.shape,Y_test.shape))

Train:  (39999,) (39999, 1) 
Validation: (13333,) (13333, 1) 
Test:  ((13334,), (13334, 1))


# TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer= TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)
tf_x_val=vectorizer.transform(X_val)

In [9]:
vectorizer_file_path="tfidfVectorizerDump.joblib"
with open(vectorizer_file_path,"wb") as f:
        pickle.dump(vectorizer, f)

In [30]:
    print(tf_x_train.shape,tf_x_val.shape,tf_x_test.shape)

(39999, 3594) (13333, 3594) (13334, 3594)


## Nerual Networks

In [31]:
class AdamOptimizer:
    def __init__(self,beta1,beta2,alpha,eps=10e-8):
#         self.params=params
#         self.n_iter=n_iter
        self.beta1=beta1
        self.beta2=beta2
        self.alpha=alpha
        self.eps=eps
        self.ms=[]
        self.vs=[]
        
    def reset_params(self,layers):
        self.ms=[ 
                  [np.zeros_like(layer.W,dtype=np.float64), np.zeros_like(layer.b,dtype=np.float64)] 
                  for layer in layers      
                 ]
        self.vs=[ 
                  [np.zeros_like(layer.W,dtype=np.float64), np.zeros_like(layer.b,dtype=np.float64)] 
                  for layer in layers      
                 ]
#         print("vs:",self.vs[0][0])
        
    def update(self,layers,N):
#         print(self.vs[0][0])
        for i in range(len(layers)):
#             print("i:",i)
#             print("vs part1: ",self.vs[i][0])
#             print("beta2:",self.beta2)
            self.ms[i][0]= self.beta1*self.ms[i][0]+(1.0-self.beta1)*layers[i].dW
            self.ms[i][1]= self.beta1*self.ms[i][1]+(1.0-self.beta1)*layers[i].db
            
#             print("before: vs of",i," = ", self.vs[i][0])
            self.vs[i][0]= self.beta2*self.vs[i][0]+(1.0-self.beta2)*np.square(layers[i].dW)
            self.vs[i][1]= self.beta2*self.vs[i][1]+(1.0-self.beta2)*np.square(layers[i].db)
#             print("after: vs of",i," = ", self.vs[i][0])

#             print("vs:",self.vs[i][0])
#             print("eps:", self.eps)
            denDW= np.sqrt((self.vs[i][0] + self.eps))
            denB=(np.sqrt((self.vs[i][1] + self.eps)))
            
            numDW=(-1 * self.alpha * self.ms[i][0])
            numB=(-1 * self.alpha * self.ms[i][1])
                    
            deltaW = np.array(numDW /denDW ,dtype=np.float64)
            deltab = np.array( numB/ denB  ,dtype=np.float64)
        
#             print("deltaW",deltaW)
#             print("deltab",deltab)
            layers[i].W +=  deltaW/N
            layers[i].b +=  deltab/N
#             layers[i].W +=  deltaW/np.sqrt(N)
#             layers[i].b +=  deltab/np.sqrt(N)
        
# class GradientDescent:
#     def __init__(self,alpha):
#         self.alpha=alpha
#     def reset_params(self,layers):
#         pass
#     def update(self,layers,N):
#         for i in range(len(layers)):
#             # layers[i].dW=layers[i].dW/N
#             # layers[i].db=layers[i].db/N
#             layers[i].W = layers[i].W - self.alpha * (layers[i].dW/N)
#             layers[i].b = layers[i].b - self.alpha * (layers[i].db/N)

In [32]:
import numpy as np
import pickle


class Layer:
    
    ### activations
    def _relu(self,z):
        return np.maximum(0,z)
    def _diff_relu(self,z):
        dZ=np.array(z,copy=True)
        dZ[dZ<=0]=0
        dZ[dZ>0]=1
        return dZ
    
    def _identity(self,z):
        return z
    
    def _identity_diff(self,z):
        return np.ones_like(z)
    
    def _sigmoid(self,z):
        return (1/(1+np.exp(-1*z)))

    def _diff_sigmoid(self,z):
        return self._sigmoid(z)*(1-self._sigmoid(z))
    
    def _softmax(self,z):
        expZ= np.exp(z-np.max(z))
        return expZ/expZ.sum(axis=0, keepdims=True)
    def _diff_softmax(self,z):
        pass

    
    ###########

    def __init__(self,n_input,n_output, activation="identity",name=None):
        self.n_output= n_output
        self.n_input= n_input
        self.name= name
        
        if activation == "identity":
            self.activation = self._identity
            self.diff_act= self._identity_diff
        
        elif activation == "sigmoid":
            self.activation = self._sigmoid
            self.diff_act= self._diff_sigmoid
            
        elif activation == "softmax":
            self.activation=self._softmax
            self.diff_act=self._diff_softmax
        elif activation =="relu":
            self.activation=self._relu
            self.diff_act=self._diff_relu
            
        self.reset_params()
            
        
    def reset_params(self): 
        self.W= np.random.randn(self.n_output,self.n_input)*np.sqrt(2/self.n_input)
        self.b= np.random.randn(self.n_output,1)*np.sqrt(2/self.n_input)

        self.dW= np.zeros_like(self.W)
        self.db= np.zeros_like(self.b)
        
        self.Z= None
        self.Ai = None
    def print_shapes(self):
        print("W: ",self.W.shape)
        print("b: ",self.b.shape)
    
    def forward(self,Ai): #data dim 

        z =  np.add((self.W @ Ai),self.b)
        A = self.activation(z)

        
        self.Z = z
        self.Ai = Ai
        return A
    
    
    def backward(self,inp):
        
#         print("input shape: ",end='')
#         print(inp.shape)
       
        act_diff = self.diff_act(self.Z)
#         print("act_diff shape: ",end='')
#         print(act_diff.shape)
        
        tmp = inp * act_diff
#         print("tmp shape: ",end='')
#         print(tmp.shape)
        
        bet = tmp @ self.Ai.T # vector of 1s
#         print("bet shape: ",end='')
#         print(bet.shape)
        
        
        e = np.ones((self.Ai.shape[1],1))
        db = tmp @ e
#         print("db shape: ",end='')
#         print(db.shape)
        self.dW = (self.dW + bet)
#         print("dw:",self.dW.shape,"\nlen:",len(self.dW))
        self.db = self.db + db
        
        
        return self.W.T @ tmp
    
    def print_weights(self):
        print("\n###################")
        if(self.name):
            print("name: ",self.name)
        print("dW: ",self.dW, "W: ",self.W)
    
    def zeroing_delta(self):
        self.dW= np.zeros_like(self.W)
        self.db= np.zeros_like(self.b)


class NN:
    
    ########
    ## losses
    def _MSE(self,y,yhat):
        a=np.square(yhat-y)
        a=np.sum(a)
        b= 1/(2*y.shape[1])
        return a*b

    ## diff losses
    def _diff_MSE(self,y,yhat,X):
        return (yhat-y)
    
    def _binary_cross_entropy(self,y,yhat):
        arr= -(y*np.log(yhat)+(1-y)*np.log(1-yhat))
        return arr.mean()
        
    def _diff_binary_cross_entropy(self,y,yhat,X):
        dl_dyhat= -(y/(yhat) - (1-y)/(1-yhat))
        return dl_dyhat
 
    
    #########
    
    def __init__(self,optimizer=None,loss="binary_cross"):
        self.layers = []
        self.optimizer=optimizer
        self.loss_name=loss
        self.initialize_loss()
    
   
    def initialize_loss(self): 
        if(self.loss_name=="binary_cross"):
            self.loss=self._binary_cross_entropy
            self.loss_diff=self._diff_binary_cross_entropy
        elif self.loss_name=="MSE":
            self.loss=self._MSE
            self.loss_diff=self._diff_MSE
        
    
    def reset_layers(self):
            for layer in self.layers:
                layer.reset_params()
    
    def forward(self,x_train):
        a=x_train
        for layer in self.layers:
            a = layer.forward(a)
        return a
    
    def backward(self,input):
        gd = input
        for layer in self.layers[::-1]:
            gd = layer.backward(gd)
            
    def add_layer(self,n_input,n_output, activation="identity",name=None):
        self.layers.append(Layer(n_input,n_output, activation=activation,name=name))
    
    def batch(self,x,y,batch_size):
        x= x.copy()
        y=y.copy()
        reminder= x.shape[0] % batch_size


        for i in range(0,x.shape[0],batch_size):
            yield (x[i:i+batch_size],y[i:i+batch_size])
        
        if reminder !=0:
            yield (x[x.shape[0]-reminder:],y[x.shape[0]-reminder:] )
    
    def fit(self, x_train,y_train,validation_data=None,batch_size=32, epochs=5): #data dim is MxN .. M no of examples.. N no of dimension
        
        M = x_train.shape[0]

        no_of_batches= np.ceil(M/batch_size)
        if(validation_data):
            x_valid=validation_data[0]
            y_valid=validation_data[1]
        
        
        for i in range(epochs):
            
            print("Epoche {}/{}".format(i+1,epochs))
            self.optimizer.reset_params(self.layers)
            batches=self.batch(x_train,y_train,batch_size)
            losses=[]
            j=0
            for cur_x,cur_y in batches:
                
                cur_x=cur_x.T
                cur_y=cur_y.T
                
                y_hat= self.forward(cur_x)

                dl_dyhat = self.loss_diff(cur_y,y_hat,self.layers[-1].Ai)
                loss=self.loss(cur_y,y_hat)
                
                losses.append(loss)

                self.backward(dl_dyhat)
                
                if batch_size==1:
                    N= M
                else:
                    N=cur_x.shape[-1]
                
                self.optimizer.update(self.layers,N)

                # zeroing deltas
                for layer in self.layers:
                    layer.zeroing_delta()
                j+=1
                
            if validation_data:
                y_hat_val = self.forward(x_valid.T)
                loss_val= self.loss(y_valid.T,y_hat_val)
                print("val_loss: {}....".format(loss_val),end=" ")
                ######
                #calc metrics
            avg_loss= np.array(losses).mean()
            if(avg_loss<0.05):
                print("Stopping early because loss converged to a small number")
                print("losses avg=",avg_loss)
                break
            else: print("losses avg=",avg_loss)

                

        print("Finished....") 
            
            
        

    
    def predict(self,x_test): #data dim is NxD .. N no of examples.. D no of dimension
#         print("x_test:", x_test.shape)
        y_hat= self.forward(x_test.T).T
#         return y_hat
        print(y_hat)
        y_hat[y_hat>0.5]=1
        y_hat[y_hat<=0.5]=0
        return y_hat
                    
    def print_weights(self):
        for i in range(len(self.layers)):
            print("layer i= ",i,end=" ")
            self.layers[i].print_weights()
    def print_shapes(self):
        for layer in self.layers:
            layer.print_shapes()
    
    def save_model(self,path):
        model=[self.layers,self.optimizer,self.loss]

        file=open(path,"wb")
        print("dumped model: ",model)

        pickle.dump(model,file)

        file.close()

    def load_model(self,path):
        file=open(path,"rb")

        model=pickle.load(file)

        file.close()
        print("loaded model: ",model)
        
        self.layers,self.optimizer,self.loss=model
        self.initialize_loss()

    

In [33]:
print(tf_x_train.shape)
print(tf_x_test.shape)
print(tf_x_val.shape)
print(Y_train.shape)
print(Y_test.shape)

(39999, 3594)
(13334, 3594)
(13333, 3594)
(39999, 1)
(13334, 1)


In [34]:

adam= AdamOptimizer(beta1 = 0.9,beta2 = 0.99,alpha=0.1,eps=0.001)
nn = NN(optimizer=adam)

nn.add_layer(tf_x_train.shape[1],64,activation="relu",name="l1")
nn.add_layer(64,32,activation = "relu",name="l2")
nn.add_layer(32,8,activation = "relu",name="l4")
nn.add_layer(8,1,activation = "sigmoid",name="l5")



In [35]:
nn.fit(tf_x_train,Y_train,validation_data=[tf_x_val,Y_val],batch_size=32,epochs=1)
# nn.load_model("modelDump.joblib")


Epoche 1/1
val_loss: 0.3606215811584297.... losses avg= 0.3859286476632469
Finished....


In [36]:
y_pred=nn.predict(tf_x_test)

[[0.53186739]
 [0.11319384]
 [0.90384396]
 ...
 [0.01074185]
 [0.94596363]
 [0.94911404]]


In [37]:
print(len(y_pred))
np.unique(y_pred,return_counts=True)

13334


(array([0., 1.]), array([6271, 7063]))

In [38]:
# Y_test
print(classification_report(y_pred,Y_test))


              precision    recall  f1-score   support

         0.0       0.81      0.86      0.83      6271
         1.0       0.87      0.82      0.84      7063

    accuracy                           0.84     13334
   macro avg       0.84      0.84      0.84     13334
weighted avg       0.84      0.84      0.84     13334



In [20]:
#dump model
nn.save_model("modelDump.joblib")

dumped model:  [[<__main__.Layer object at 0x7f4c546cbfa0>, <__main__.Layer object at 0x7f4c5461d0a0>, <__main__.Layer object at 0x7f4c546cbdf0>, <__main__.Layer object at 0x7f4c5461d3a0>], <__main__.AdamOptimizer object at 0x7f4c546cb8b0>, <bound method NN._binary_cross_entropy of <__main__.NN object at 0x7f4c546cbac0>>]


In [None]:
file=open("tfidfVectorizerDump.joblib","wb")
pickle.dump(file,vectorizer)