In [9]:
import pandas as pd
import numpy as np
import re 
from nltk.tokenize import word_tokenize,TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
import gensim 
import logging

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Conv1D,Dropout,MaxPooling1D,GlobalMaxPool1D,Activation,Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split


In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
#%%  Used Function
def Clean_sentences(sent):
    stemmer = PorterStemmer()
    lemmatizer=WordNetLemmatizer()
    sent= re.sub('[^a-zA-Z]', ' ', sent)
    tokens=word_tokenize(sent)
    tokens=[lemmatizer.lemmatize(stemmer.stem(word.lower())) for word in tokens if word.lower() not in stopwords.words('english')and len(word)>1 and word.isalnum()]
    return tokens
    

In [18]:
#%%  loading cleaning data

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

dataset=pd.read_csv('TrainData.csv',names=['Types','Sentences'])
dataset.info()

Types = dataset['Types'].values
Sentences = dataset['Sentences'].values
#worldlist=[]

#for Sentence in Sentences :
#        worldlist.append(Clean_sentences(Sentence))

#print(np.max([len(ligne) for ligne in worldlist]))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1998 entries, 0 to 1997
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Types      1998 non-null   object
 1   Sentences  1998 non-null   object
dtypes: object(2)
memory usage: 31.3+ KB


In [16]:
#%%    build word2vect model

#w2v_model = gensim.models.Word2Vec(worldlist, vector_size=300, window=5, min_count=1, workers=10)
#w2v_model.train(worldlist,total_examples=len(worldlist),epochs=10)
# w2v_model.save("word2vec.model")    

w2v_model=gensim.models.Word2Vec.load('word2vec.model')
print(w2v_model,len(w2v_model.wv.vectors),sep='\n')


2022-01-17 12:10:03,938 : INFO : loading Word2Vec object from word2vec.model
2022-01-17 12:10:03,955 : INFO : loading wv recursively from word2vec.model.wv.* with mmap=None
2022-01-17 12:10:03,955 : INFO : setting ignored attribute cum_table to None
2022-01-17 12:10:04,111 : INFO : Word2Vec lifecycle event {'fname': 'word2vec.model', 'datetime': '2022-01-17T12:10:04.111139', 'gensim': '4.1.2', 'python': '3.9.9 (main, Dec 16 2021, 23:13:29) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.0-kali2-amd64-x86_64-with-glibc2.33', 'event': 'loaded'}


Word2Vec(vocab=18237, vector_size=300, alpha=0.025)
18237


In [21]:
#%% preprocessing data 

Keras_tk = Tokenizer(len(w2v_model.wv.vectors))
Keras_tk.fit_on_texts(Sentences)

X_train_sequence = Keras_tk.texts_to_sequences(Sentences)

MAX_LENGTH = 2194

X_train_sequence  = pad_sequences(X_train_sequence, MAX_LENGTH )

print("Vocabulary size={}".format(len(Keras_tk.word_index)))

Vocabulary size=28581


In [23]:
#%% Encoding Labels

Encodelabels=LabelEncoder()

Y= Encodelabels.fit_transform(Types)
Y= to_categorical(Y)
Y.shape

(1998, 5)

In [24]:
#%%  Split train And Test data

X_train, X_test, Y_train, Y_test = train_test_split(np.array(X_train_sequence), Y, train_size=0.85, stratify=Y)

print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

((1698, 5), (300, 5))

In [25]:
#%% Build CNN model using keras

Keras_model = Sequential()
Keras_model.add(Embedding(len(Keras_tk.word_index), 400, input_length=MAX_LENGTH))
Keras_model.add(Conv1D(filters=64, kernel_size=16, padding='same', activation='relu'))
Keras_model.add(MaxPooling1D(pool_size=4))

Keras_model.add(Conv1D(filters=32, kernel_size=16, padding='same', activation='relu'))
Keras_model.add(MaxPooling1D(pool_size=4))
Keras_model.add(Flatten())
Keras_model.add(Dense(256, activation='relu'))
Keras_model.add(Dense(5, activation='softmax'))
Keras_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
Keras_model.summary()

2022-01-17 12:17:38.452039: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-17 12:17:38.516034: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-01-17 12:17:38.516088: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-01-17 12:17:38.517293: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2194, 400)         11432400  
                                                                 
 conv1d (Conv1D)             (None, 2194, 64)          409664    
                                                                 
 max_pooling1d (MaxPooling1D  (None, 548, 64)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 548, 32)           32800     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 137, 32)          0         
 1D)                                                             
                                                                 
 flatten (Flatten)           (None, 4384)              0

In [30]:
#%% training

#Keras_model.fit(X_train, Y_train,validation_split=0.1,epochs=8,batch_size=100,verbose=1)
Keras_model.fit(X_train, Y_train, batch_size=128, epochs=4, validation_data=(X_test, Y_test))
Accuracy = Keras_model.evaluate(X_test, Y_test)
print("Accuracy: %.2f%%" % (Accuracy[1]*100))

#Keras_model.save('TextClassification_95.33%.h5')

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy: 83.33%


In [32]:
#%% Prediction
predict = Keras_model.predict(X_test).ravel()
print(predict)

[8.6672242e-10 1.9124096e-17 4.2319231e-04 ... 4.1487347e-06 7.6890962e-13
 9.9999583e-01]


###  Test Part

In [28]:
from keras.models import load_model

In [None]:
Test_dataset=pd.read_csv('Testdata.csv',names=['ID','text'])
Test_dataset.info()
Test_dataset.head()

text = Test_dataset['text'].values
worldlist=[]

for Sentence in text :
        worldlist.append(Clean_sentences(Sentence))

In [31]:
w2v_model=gensim.models.Word2Vec.load('word2vec.model')
MAX_LENGTH = 2194

Keras_tk = Tokenizer(len(w2v_model.wv.vectors))
Keras_tk.fit_on_texts(text)

X_test_sequence = Keras_tk.texts_to_sequences(text)
X_test_sequence  = pad_sequences(X_test_sequence, MAX_LENGTH )

2022-01-17 12:32:45,312 : INFO : loading Word2Vec object from word2vec.model
2022-01-17 12:32:45,345 : INFO : loading wv recursively from word2vec.model.wv.* with mmap=None
2022-01-17 12:32:45,346 : INFO : setting ignored attribute cum_table to None
2022-01-17 12:32:45,542 : INFO : Word2Vec lifecycle event {'fname': 'word2vec.model', 'datetime': '2022-01-17T12:32:45.542868', 'gensim': '4.1.2', 'python': '3.9.9 (main, Dec 16 2021, 23:13:29) \n[GCC 11.2.0]', 'platform': 'Linux-5.15.0-kali2-amd64-x86_64-with-glibc2.33', 'event': 'loaded'}


In [33]:
Keras_model = load_model('TextClassification_95.33%.h5')
Probabilities=Keras_model.predict(X_test_sequence)

Class_Dict={0:'business',1:'entertainment',2:'sport',3:'tech',4:'politics'}
FinalResult={'ID':[],'Proba':[],'Class':[]}

for i in range(len(Probabilities)):
    maxProba=np.max(Probabilities[i])
    Class=[j for j in range(len(Probabilities[i])) if Probabilities[i][j]==maxProba][0]
    Class=Class_Dict[Class]
    FinalResult['ID'].append(i+1)
    FinalResult['Proba'].append(str("%.2f%%" % (maxProba*100)))
    FinalResult['Class'].append(Class)
print(FinalResult)

{'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 

In [None]:

df = pd.DataFrame({'ID': FinalResult['ID'],
                   'Proba': FinalResult['Proba'],
                   'Class':FinalResult['Class']
                   })

df.to_csv('YOUNES AMRI - Result.csv', index=False)