#Data Reading

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth',100)

In [4]:
Data=pd.read_excel('/content/drive/MyDrive/Ankit/CONSTRAINT-2021 october2020 /english/Constraint_English_Train.xlsx')
Data.dropna(inplace = True)
Data.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In general the discrepancies in death counts between dif...,real
1,2,States reported 1121 deaths a small rise from last Tuesday. Southern states reported 640 of thos...,real
2,3,Politically Correct Woman (Almost) Uses Pandemic as Excuse Not to Reuse Plastic Bag https://t.co...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testing laboratories in India and as on 25th August 2020...,real
4,5,Populous states can generate large case counts but if you look at the new cases per million toda...,real


In [5]:
Data.columns=['id','Post','label']

#Data Preprocessing

removing stopwords

In [6]:

import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
def remove_sw(txt):
    no_sw =" ".join([a for a in txt.split() if a not in stop])
    return no_sw
Data['Post'] = Data['Post'].apply(lambda x: remove_sw(x))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


rem0ving emoji

In [7]:


import re

def deEmojify(txt):
    regrex_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',txt)

Data['Post']=Data['Post'].apply(lambda x:deEmojify(x))

remove URL

In [8]:

def remove_URL(txt):
    url= re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"",txt)

Data['Post']=Data['Post'].apply(lambda x:remove_URL(x))

#removing numbers
Data['Post']=Data['Post'].str.replace(r'\d+','')
Data.head()

Unnamed: 0,id,Post,label
0,1,The CDC currently reports deaths. In general discrepancies death counts different sources small...,real
1,2,States reported deaths small rise last Tuesday. Southern states reported deaths.,real
2,3,Politically Correct Woman (Almost) Uses Pandemic Excuse Not Reuse Plastic Bag #coronavirus #nas...,fake
3,4,#IndiaFightsCorona: We #COVID testing laboratories India th August tests done : @ProfBhargava...,real
4,5,Populous states generate large case counts look new cases per million today smaller states show...,real


# Tokenization, Encoding, padding after preprocessing

In [9]:
import keras
from keras.preprocessing.text import Tokenizer
tok1 = Tokenizer(char_level=False, filters ='!"$%&@()*+,-./:;”“<=>?[\\]^_`{|}~\t\n', lower = True)

In [10]:
tok1.fit_on_texts(Data['Post'])
words=len(tok1.word_counts)
words

13718

In [11]:
word_list=[tok1.word_index]
#word_list

In [12]:
encoded=tok1.texts_to_sequences(Data['Post'])
print(Data['Post'][0])
encoded[0]

The CDC currently reports  deaths. In general discrepancies death counts different sources small explicable. The death toll stands roughly  people today.


[6,
 91,
 223,
 209,
 10,
 47,
 447,
 6628,
 88,
 1166,
 389,
 1645,
 650,
 6629,
 6,
 88,
 615,
 513,
 1646,
 7,
 22]

In [13]:
##         Padding encoded sequence of words
from keras.preprocessing import sequence
max_length=20
padd = sequence.pad_sequences(encoded, maxlen=max_length, padding='post')
padd

array([[  91,  223,  209, ..., 1646,    7,   22],
       [  11,   17,   10, ...,    0,    0,    0],
       [4721, 1776,  390, ...,    0,    0,    0],
       ...,
       [  12,  224,  109, ...,    0,    0,    0],
       [2786,  983, 2663, ...,    0,    0,    0],
       [  83,   51,   28, ...,   69,  148,  158]], dtype=int32)

#Label Encoding

In [14]:
import numpy as np
from keras.utils.np_utils import to_categorical
from sklearn import preprocessing
labelEncode=preprocessing.LabelEncoder()
labelEncode.fit(Data['label'])
print (labelEncode.classes_)
train_labelEncode=labelEncode.transform(Data['label'])
label=to_categorical(np.asarray(train_labelEncode))
label

['fake' 'real']


array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32)

# importing libraries for model creation

In [15]:

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Embedding,MaxPool1D
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D

# Preparing Embedding layer

In [16]:
Embedding_Layer=Embedding(input_dim = words + 1, output_dim = 100,input_length=max_length)


#CNN model

In [17]:
model = Sequential()
model.add(Embedding_Layer)
model.add(Conv1D(filters =32,kernel_size=2, activation='relu'))
model.add(MaxPooling1D(5, strides=1,padding='same'))
model.add(Conv1D(filters =16,kernel_size=3, activation='relu'))
model.add(MaxPooling1D(3, strides=1,padding='same'))
model.add(Flatten())
model.add(Dense(64, activation = 'relu'))
model.add(Dense(2, activation = 'sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 100)           1371900   
_________________________________________________________________
conv1d (Conv1D)              (None, 19, 32)            6432      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 19, 32)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 17, 16)            1552      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 17, 16)            0         
_________________________________________________________________
flatten (Flatten)            (None, 272)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                1

#Model Compilation

In [18]:
## compile

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

#model Training

In [19]:
model.fit(padd,label,epochs=20,verbose=1,batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb891f7db90>

#Validation Data reading

In [21]:
vData =pd.read_excel('/content/drive/MyDrive/Ankit/CONSTRAINT-2021 october2020 /english/Constraint_English_Val.xlsx')
vData.dropna(inplace = True)
vData.head()

Unnamed: 0,id,tweet,label
0,1,Chinese converting to Islam after realising that no muslim was affected by #Coronavirus #COVD19 ...,fake
1,2,11 out of 13 people (from the Diamond Princess Cruise ship) who had intially tested negative in ...,fake
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus And Can Be Treated With Aspirin",fake
3,4,Mike Pence in RNC speech praises Donald Trump’s COVID-19 “seamless” partnership with governors a...,fake
4,5,6/10 Sky's @EdConwaySky explains the latest #COVID19 data and government announcement. Get more ...,real


#validation data preprocessing

In [22]:
## doing preprocessing as training

vData['tweet'] =vData['tweet'].apply(lambda x: remove_sw(x))
vData['tweet']=vData['tweet'].apply(lambda x:deEmojify(x))
vData['tweet']=vData['tweet'].apply(lambda x: remove_URL(x))
vData['tweet']=vData['tweet'].str.replace(r'\d+','')

#validation data encoding,padding

In [23]:
encoded1 =tok1.texts_to_sequences(vData['tweet'])
print(vData['tweet'][0])
encoded1[0]

Chinese converting Islam realising muslim affected #Coronavirus #COVD country


[245, 9376, 4298, 856, 441, 19, 86]

In [24]:
padded = sequence.pad_sequences(encoded1, maxlen=max_length, padding='post')
padded[0]

array([ 245, 9376, 4298,  856,  441,   19,   86,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

#validation label encoding

In [25]:
val_labelEncode=labelEncode.transform(vData['label'])
val_label=to_categorical(np.asarray(val_labelEncode))
val_label[0]

array([1., 0.], dtype=float32)

# validation model prediction

In [26]:
val_predictions = model.predict(padded)
val_predictions1 = np.zeros_like(val_predictions)
val_predictions1[np.arange(len(val_predictions)),val_predictions.argmax(1)] = 1

#validation Data Classification Report

In [27]:
#Accuracy 
from sklearn.metrics import classification_report
print(classification_report(val_label,val_predictions1))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94      1020
           1       0.94      0.95      0.94      1120

   micro avg       0.94      0.94      0.94      2140
   macro avg       0.94      0.94      0.94      2140
weighted avg       0.94      0.94      0.94      2140
 samples avg       0.94      0.94      0.94      2140



#reading testing data

In [29]:

tData=pd.read_excel('/content/drive/MyDrive/Ankit/CONSTRAINT-2021 october2020 /english/English_Test.xlsx')
tData.head()

Unnamed: 0,id,tweet
0,1,Our daily update is published. States reported 734k tests 39k new cases and 532 deaths. Current ...
1,2,Alfalfa is the only cure for COVID-19.
2,3,President Trump Asked What He Would Do If He Were To Catch The Coronavirus https://t.co/3MEWhusR...
3,4,States reported 630 deaths. We are still seeing a solid national decline. Death reporting lags a...
4,5,This is the sixth time a global health emergency has been declared under the International Healt...


#preprocessing as training


In [30]:
tData['tweet'] =tData['tweet'].apply(lambda x: remove_sw(x))
tData['tweet']=tData['tweet'].apply(lambda x:deEmojify(x))
tData['tweet']=tData['tweet'].apply(lambda x: remove_URL(x))
tData['tweet']=tData['tweet'].str.replace(r'\d+','')

#encoding and padding test data

In [31]:
encoded2 =tok1.texts_to_sequences(tData['tweet'])
print(tData['tweet'][0])
encoded2[0]

Our daily update published. States reported k tests k new cases  deaths. Current hospitalizations fell k first time since June . 


[46,
 49,
 40,
 103,
 11,
 17,
 38,
 8,
 38,
 5,
 2,
 10,
 260,
 233,
 1414,
 38,
 81,
 89,
 83,
 310]

In [32]:
#padding

t_padded = sequence.pad_sequences(encoded2, maxlen=max_length, padding='post')
t_padded[0]

array([  46,   49,   40,  103,   11,   17,   38,    8,   38,    5,    2,
         10,  260,  233, 1414,   38,   81,   89,   83,  310], dtype=int32)

#label prediction


In [33]:
test_predictions = model.predict(t_padded)

test_predictions1 = np.argmax(test_predictions,axis=1)

test_predictions1

array([1, 0, 0, ..., 1, 1, 1])

#adding predicted label as colum to dataFrame "tData"

In [34]:
tData['Predictionlabel']=labelEncode.inverse_transform(test_predictions1)
tData

Unnamed: 0,id,tweet,Predictionlabel
0,1,Our daily update published. States reported k tests k new cases deaths. Current hospitalization...,real
1,2,Alfalfa cure COVID-.,fake
2,3,President Trump Asked What He Would Do If He Were To Catch The Coronavirus #donaldtrump #corona...,fake
3,4,States reported deaths. We still seeing solid national decline. Death reporting lags approximat...,real
4,5,This sixth time global health emergency declared International Health Regulations easily severe-...,real
...,...,...,...
2135,2136,#CoronaVirusUpdates: State-wise details Total Confirmed #COVID cases (till September AM) ➡️St...,real
2136,2137,"Tonight (midnight) onwards Disaster Management Act implemented across country. According update,...",fake
2137,2138,new cases #COVIDNigeria; Plateau- Enugu- Oyo- Lagos- Rivers- FCT- Kaduna- Bauchi- Delta- Ekiti-...,real
2138,2139,RT @CDCemergency: #DYK? @CDCgov’s One-Stop Shop #COVID Resources section communicating people ag...,real


#generating CSV file

In [35]:
#EnglishTaskFile=tData[['id','Predictionlabel']]
#EnglishTaskFile.head()
tData.to_csv('team_iiit_dwd_1.csv',index=False)