In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**One Hot Representation**
## using tensorflow

In [None]:
#!pip install tensorflow

In [2]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import one_hot

In [3]:
df=pd.read_csv('/content/drive/MyDrive/AILAB/Constraint_English_Train - Constraint_English_Train.csv')
print(df.shape)
df.head()

(6420, 3)


Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [4]:
#text preprocessing
#data cleaning and preprocessing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
corpus=[]
for i in range(0, len(df)):
  tweet=re.sub('[^a-zA-Z]', ' ',df['tweet'][i])
  tweet=tweet.lower()
  tweet=tweet.split()
  tweet=[word for word in tweet if not word in stopwords.words('english')]
  tweet=' '.join(tweet)
  corpus.append(tweet)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
voc_size=10000

In [6]:
one_hot_rep=[one_hot(words, voc_size) for words in corpus]

In [7]:
print(len(one_hot_rep[0]))

18


##**Word Embedding**

In [8]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [9]:
embedded_doc=pad_sequences(one_hot_rep, padding='pre', maxlen=50) #we are making each sentences of word 50 if len(sentence)<50 we add 0 as prefix words
print(embedded_doc)

[[   0    0    0 ...   27 1275 3261]
 [   0    0    0 ... 2638 8970 5215]
 [   0    0    0 ... 2540  387  247]
 ...
 [   0    0    0 ... 5081 9477   24]
 [   0    0    0 ... 6823   79 5811]
 [   0    0    0 ... 7683  917 3807]]


In [10]:
dim=100
#here dimension is no of features we want for embedding

In [11]:
model=Sequential()
model.add(Embedding(voc_size, dim, input_length=50)) #vocab_size, dimension and sentence length are parameters
model.compile('adam', 'mse') #we are using adam optimizer considering performance metrics as mean square errror


In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 100)           1000000   
                                                                 
Total params: 1,000,000
Trainable params: 1,000,000
Non-trainable params: 0
_________________________________________________________________


In [13]:
print(model.predict(embedded_doc))

[[[ 0.01116624 -0.00046997  0.0060371  ...  0.04593204 -0.0395422
    0.00133194]
  [ 0.01116624 -0.00046997  0.0060371  ...  0.04593204 -0.0395422
    0.00133194]
  [ 0.01116624 -0.00046997  0.0060371  ...  0.04593204 -0.0395422
    0.00133194]
  ...
  [-0.00956062 -0.03110782 -0.04920684 ... -0.03192683 -0.00206124
    0.04824778]
  [ 0.00416192  0.04181411  0.03901022 ... -0.02797275 -0.0132497
   -0.04709367]
  [-0.01169269  0.03993592 -0.00348141 ...  0.0305576   0.0175196
   -0.03242209]]

 [[ 0.01116624 -0.00046997  0.0060371  ...  0.04593204 -0.0395422
    0.00133194]
  [ 0.01116624 -0.00046997  0.0060371  ...  0.04593204 -0.0395422
    0.00133194]
  [ 0.01116624 -0.00046997  0.0060371  ...  0.04593204 -0.0395422
    0.00133194]
  ...
  [-0.00639695 -0.01476786  0.04791361 ... -0.03899968  0.01020653
    0.03370513]
  [-0.04627291  0.02242002 -0.01416454 ... -0.01043875 -0.04606382
    0.04824883]
  [ 0.03724972 -0.0103653   0.02907406 ...  0.03418144 -0.01071565
   -0.04763461

In [None]:
embedded_doc.shape

(6420, 50)

In [14]:
X=embedded_doc
y=pd.get_dummies(df['label'])
y=y.iloc[:,1].values
#Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.25, random_state=0)

In [15]:
#training using Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
tweet_analyse_model=MultinomialNB().fit(X_train, y_train)
y_pred=tweet_analyse_model.predict(X_test)
y_pred.shape

(1605,)

In [16]:
#confusion matrix 
from sklearn.metrics import confusion_matrix  
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[603 169]
 [340 493]]


In [17]:
#calculate accuracy
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score
from sklearn.metrics import classification_report
print("Accuracy Score of naive bayes Model: ",accuracy_score(y_test, y_pred))
print("Precision score of naive bayes Model:", precision_score(y_test, y_pred))
print("Recall of naive bayes Model:",recall_score(y_test,y_pred))
print("f1_score of naive bayes Model:",f1_score(y_test,y_pred))

Accuracy Score of naive bayes Model:  0.6828660436137072
Precision score of naive bayes Model: 0.7447129909365559
Recall of naive bayes Model: 0.5918367346938775
f1_score of naive bayes Model: 0.6595317725752509


In [None]:
#print(model.predict(embedded_doc[0]))

In [18]:
#Applying PCA
from sklearn.decomposition import PCA
pca=PCA(n_components=2)
X_train=pca.fit_transform(X_train)
X_test=pca.transform(X_test)

In [19]:
X_train[0]


array([-2554.04048047, -6110.98418942])

In [20]:
#training using Naive Bayes Classifier
# Gaussian is used because features follow a normal distribution.
from sklearn.naive_bayes  import GaussianNB
tweet_analyse_model=GaussianNB().fit(X_train, y_train)
y_pred=tweet_analyse_model.predict(X_test)
y_pred.shape

(1605,)

In [21]:
#calculate accuracy
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score
from sklearn.metrics import classification_report
print("Accuracy Score of naive bayes Model: ",accuracy_score(y_test, y_pred))
print("Precision score of naive bayes Model:", precision_score(y_test, y_pred))
print("Recall of naive bayes Model:",recall_score(y_test,y_pred))
print("f1_score of naive bayes Model:",f1_score(y_test,y_pred))

Accuracy Score of naive bayes Model:  0.7115264797507788
Precision score of naive bayes Model: 0.7055555555555556
Recall of naive bayes Model: 0.7623049219687875
f1_score of naive bayes Model: 0.7328332371609925


In [22]:
from sklearn.linear_model import LogisticRegression
log_classifier=LogisticRegression(random_state=0,max_iter=2000)
log_classifier.fit(X_train, y_train)
y_pred=log_classifier.predict(X_test)

In [23]:
print("Accuracy Score of naive bayes Model: ",accuracy_score(y_test, y_pred))
print("Precision score of naive bayes Model:", precision_score(y_test, y_pred))
print("Recall of naive bayes Model:",recall_score(y_test,y_pred))
print("f1_score of naive bayes Model:",f1_score(y_test,y_pred))

Accuracy Score of naive bayes Model:  0.712772585669782
Precision score of naive bayes Model: 0.7089887640449438
Recall of naive bayes Model: 0.7575030012004802
f1_score of naive bayes Model: 0.7324434126523506
