# **CNN-Based Method**

In [None]:
import nltk

# Fetch the NLTK resources used later in this notebook: the 'punkt'
# tokenizer models and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import numpy as np
from nltk.collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.svm import SVC as SVC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
from matplotlib import pyplot

# NLTK's English stop-word list, held in a set so the tokenizing helpers
# below can do fast membership tests.
# NOTE(review): the corpus files read further down are named *_hindi.txt; an
# English stop-word list may filter very little of that text -- confirm intent.
stop_words = set(stopwords.words("english"))

def _build_channel(length, vocab_size, kernel_size):
    """Build one embedding -> conv -> dropout -> max-pool -> flatten branch.

    Args:
        length: Number of positions in each input sequence.
        vocab_size: ``input_dim`` passed to the ``Embedding`` layer.
        kernel_size: Width of the ``Conv1D`` kernel for this branch.

    Returns:
        A ``(inputs, flat)`` pair: the branch's ``Input`` tensor and its
        flattened output tensor.
    """
    inputs = Input(shape=(length,))
    embedding = Embedding(vocab_size, 100)(inputs)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    flat = Flatten()(pool)
    return inputs, flat


def define_model(length, vocab_size):
    """Build and compile the three-channel 1-D CNN.

    Three parallel branches share the same architecture and differ only in
    their convolution kernel size (4, 6 and 8). Their flattened outputs are
    concatenated, passed through a 10-unit ReLU layer and reduced to a single
    sigmoid unit. The model is compiled with binary cross-entropy loss, the
    Adam optimizer and an accuracy metric.

    Side effects: prints the model summary and writes the architecture
    diagram to ``multichannel.png``.

    Args:
        length: Number of positions in each input sequence.
        vocab_size: ``input_dim`` passed to every ``Embedding`` layer.

    Returns:
        The compiled model, which expects a list of three input arrays.
    """
    # Branches are created in order so the three model inputs keep the same
    # positions as before (kernel size 4 first, then 6, then 8).
    channels = [_build_channel(length, vocab_size, size) for size in (4, 6, 8)]
    inputs = [channel_input for channel_input, _ in channels]
    # merge
    merged = concatenate([channel_flat for _, channel_flat in channels])
    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=inputs, outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line.
    model.summary()
    plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model

def create_lexicon(pos, neg, max_count=4000):
    """Build the vocabulary from two text files.

    Every non-empty line of both files is tokenized with ``word_tokenize``.
    Tokens present in the module-level ``stop_words`` set are dropped and the
    remaining tokens are counted across both files together.

    Args:
        pos: Path of the first text file.
        neg: Path of the second text file.
        max_count: Exclusive upper bound on a token's total count; tokens
            seen this many times or more are left out. Defaults to 4000,
            the cutoff that was previously hard-coded.

    Returns:
        A list of the distinct kept tokens, in order of first occurrence.
    """
    word_counts = Counter()
    for file_name in (pos, neg):
        # Decode explicitly as UTF-8 (as samplehandling does) so the result
        # does not depend on the platform's default encoding.
        with open(file_name, 'r', encoding="utf8") as f:
            for line in f.read().split('\n'):
                if line:
                    word_counts.update(
                        word for word in word_tokenize(line)
                        if word not in stop_words
                    )
    return [word for word, count in word_counts.items() if count < max_count]


def samplehandling(sample, lexicons, classification):
    """Turn each non-empty line of a file into a labelled count vector.

    A line is tokenized with ``word_tokenize``; tokens in the module-level
    ``stop_words`` set are ignored. For every remaining token that appears in
    ``lexicons``, the vector slot at that token's lexicon position is
    incremented by one.

    Args:
        sample: Path of the UTF-8 text file to read.
        lexicons: Sequence of vocabulary words (assumed hashable, e.g. str);
            position ``i`` of each feature vector counts ``lexicons[i]``.
        classification: Label attached to every row from this file.

    Returns:
        A list of ``[features, classification]`` pairs, where ``features`` is
        a list of ``len(lexicons)`` counts.
    """
    # Resolve word -> position once. setdefault keeps the first position of a
    # repeated word, matching what list.index() would return, while avoiding a
    # linear scan of the lexicon for every token.
    index_of = {}
    for position, word in enumerate(lexicons):
        index_of.setdefault(word, position)

    featureset = []
    with open(sample, 'r', encoding="utf8") as f:
        for line in f.read().split('\n'):
            if not line:
                continue
            features = np.zeros(len(lexicons))
            for word in word_tokenize(line):
                if word in stop_words:
                    continue
                idx = index_of.get(word)
                if idx is not None:
                    features[idx] += 1
            featureset.append([list(features), classification])
    return featureset

def create_feature_set(pos, neg):
    """Build the labelled feature rows for a pair of text files.

    The vocabulary is derived from both files with ``create_lexicon`` and
    printed together with its size. Rows from ``pos`` are labelled 1 and rows
    from ``neg`` are labelled 0.

    Returns:
        The rows from ``pos`` followed by the rows from ``neg``, each a
        ``[features, label]`` pair as produced by ``samplehandling``.
    """
    vocabulary = create_lexicon(pos, neg)
    print(vocabulary)
    print(len(vocabulary))
    positive_rows = samplehandling(pos, vocabulary, 1)
    negative_rows = samplehandling(neg, vocabulary, 0)
    return positive_rows + negative_rows

def create_test_data_for_unigram(pos):
    """Turn each non-empty line of a file into a word-presence vector.

    The vocabulary is rebuilt from ``'pos_hindi.txt'`` and ``'neg_hindi.txt'``
    with ``create_lexicon``. For every line, slot ``i`` of the vector is 1.0
    when ``lexicons[i]`` occurs among the line's tokens and 0.0 otherwise.

    NOTE(review): these are presence flags, whereas ``samplehandling`` emits
    per-line occurrence counts -- confirm the difference is intended.

    Args:
        pos: Path of the text file to featurize.

    Returns:
        A list of ``[features, 1]`` pairs. The label is always 1 here.
    """
    lexicons = create_lexicon('pos_hindi.txt', 'neg_hindi.txt')
    testset = []
    # Decode explicitly as UTF-8 so reading does not depend on the
    # platform's default encoding.
    with open(pos, "r", encoding="utf8") as f:
        for line in f.read().split('\n'):
            if not line:
                continue
            words = set(word_tokenize(line))
            featureset = np.zeros(len(lexicons))
            # Use the enumerate position directly. Looking the word up again
            # via lexicons.index(w.lower()) could raise ValueError, or select
            # another column, for an entry containing uppercase characters.
            for idx, w in enumerate(lexicons):
                if w in words:
                    featureset[idx] += 1
            testset.append([list(featureset), 1])
    return testset

def test_by_unigram():
    """Train the multichannel CNN on unigram features and print test accuracy.

    Training rows come from ``create_feature_set`` on ``'pos_hindi.txt'`` and
    ``'neg_hindi.txt'``. Test features come from ``'x_test_hindi.txt'`` and
    test labels from ``'y_test_hindi.txt'`` (one integer per line). The same
    feature matrix is fed to all three model inputs.
    """
    featureset = create_feature_set('pos_hindi.txt', 'neg_hindi.txt')
    # Split the [features, label] pairs directly. Converting the ragged pairs
    # with np.array() first raises ValueError on NumPy >= 1.24.
    x = np.array([features for features, _ in featureset])
    y = np.array([label for _, label in featureset])

    # The input width must equal the feature-vector length, so take it from
    # the data instead of hard-coding one particular lexicon size.
    # NOTE(review): 926 is the Embedding input_dim; feature values presumably
    # have to stay below it -- confirm.
    model = define_model(x.shape[1], 926)
    model.fit([x, x, x], y, epochs=10, batch_size=5)

    testset = create_test_data_for_unigram('x_test_hindi.txt')
    test_x = np.array([features for features, _ in testset])
    with open("y_test_hindi.txt", 'r') as f:
        # Skip blank lines (e.g. a trailing newline) that int() cannot parse.
        test_y = np.array(
            [int(line) for line in f.read().split('\n') if line.strip()]
        )

    _loss, acc = model.evaluate([test_x, test_x, test_x], test_y, verbose=0)
    print("accuracy: %f" % (acc*100))

# Run the train-and-evaluate routine only when this code is executed directly.
if __name__ == '__main__':
    test_by_unigram()

['मिकी', 'वाइरस', 'पूरी', 'तरह', 'से', 'मनीष', 'पॉल', 'की', 'फिल्म', 'है', 'और', 'में', 'उनकी', 'इमेज', 'के', 'हिसाब', 'ही', 'दृश्य', 'स्थितियां', 'रची', 'गई', 'थीं', '$', 'ने', 'अपने', 'किरकार', 'को', 'बखूबी', 'निभाया', 'उन्होंने', 'शाहिद', 'डर', ',', 'खौफ', 'संघर्ष', 'जिद', 'जीत', 'व्यक्त', 'किया', 'वे', 'हर', 'भाव', 'दृश्यों', 'नैचुरल', 'लगते', 'हैं', 'कास्टिंग', 'जबरदस्त', 'भाइयों', 'मां', 'पत्\u200d\u200cनी', 'भूमिकाओं', 'भी', 'उपयुक्त', 'कलाकारों', 'का', 'चयन', 'हुआ', 'कोर्ट', 'रूम', 'सीन', 'रियल', 'अक्षय', 'कुमार', 'अल्हड़', 'बेफिक्र', 'अंदाज', 'मुग्ध', 'करता', 'कैमरे', 'सामने', 'उनका', 'गहन', 'आत्मविश्वास', 'मुखर', 'रहता', 'उन्हें', 'कुछ', 'चटपटे', 'संवाद', 'मिले', 'सीमित', 'अपनी', 'प्रतिभा', 'जाहिर', 'कर', 'देते', 'फराज', 'हैदर', 'भारत-पाकिस्तान', 'बीच', 'विभाजन', 'बाद', 'जारी', 'पीढि़यों', 'तनातनी', 'युद्ध', 'माहौल', 'शांति', 'अमन', 'सोच', 'साथ', 'मजाकिया', 'तौर', 'पर', 'पेश', 'विचार', 'प्रशंसनीय', 'गाने', 'एक', 'आकर्षण', 'चुलबुल', '(', 'ऋषि', 'कपूर', ')', 'बुलबुल', 'नीतू', 'स



Model: "model_18"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_55 (InputLayer)           [(None, 3257)]       0                                            
__________________________________________________________________________________________________
input_56 (InputLayer)           [(None, 3257)]       0                                            
__________________________________________________________________________________________________
input_57 (InputLayer)           [(None, 3257)]       0                                            
__________________________________________________________________________________________________
embedding_54 (Embedding)        (None, 3257, 100)    92600       input_55[0][0]                   
___________________________________________________________________________________________



accuracy: 81.111109
