# Importation des bibliothèques :

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn import svm
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Importation des données :

In [2]:
# Directory that holds the competition CSV files; change it here when running
# the notebook outside of the original Kaggle-style "../input" layout.
DATA_DIR = "../input/sentiment-analysis-on-moroccan-arabic-dialect"

# Labelled training comments and the unlabelled stage-1 test comments.
df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test_stage1.csv")

# Text Preprocessing :

In [3]:
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below: the stop-word lists and the "punkt"
# tokenizer data (presumably what word_tokenize relies on -- confirm for the
# installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

# Arabic stop words, as shipped with NLTK.
stop_words = stopwords.words('arabic')

# Arabic/typographic punctuation followed by the ASCII punctuation set.
punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation

# Translation table that deletes every character listed in `punctuations`.
translator = str.maketrans('', '', punctuations)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def removeStopWords(text):
    """Return `text` without the tokens found in the module-level `stop_words`.

    The text is split with `word_tokenize`; the tokens that survive are joined
    back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def removestpWords(text):
    """Return `text` without the tokens found in the global `stpWords`.

    The text is split with `word_tokenize` and the remaining tokens are joined
    with single spaces.

    NOTE(review): `stpWords` is not defined anywhere in the visible part of
    this notebook, so calling this function would raise NameError unless the
    name is created elsewhere -- confirm before use.
    """
    tokens = word_tokenize(text)
    return ' '.join(token for token in tokens if token not in stpWords)

def removeNoise(text):
    """Return `text` without the tokens found in the global `noise`.

    The text is split with `word_tokenize` and the remaining tokens are joined
    with single spaces.

    NOTE(review): `noise` is not defined anywhere in the visible part of this
    notebook, so calling this function would raise NameError unless the name
    is created elsewhere -- confirm before use.
    """
    tokens = word_tokenize(text)
    surviving = filter(lambda token: token not in noise, tokens)
    return ' '.join(surviving)

def NormalizeArabic(text):
    """Collapse spelling variants of several Arabic letters onto one form.

    Applied in order: alef variants -> bare alef, alef maqsura -> yeh,
    waw-with-hamza and yeh-with-hamza -> hamza, teh marbuta -> heh.
    """
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Characters stripped by arabic_diacritics(), written as explicit code points
# because the marks are combining characters that are invisible or ambiguous
# when typed literally:
#   U+0651 shadda, U+064E fatha, U+064B fathatan, U+064F damma,
#   U+064C dammatan, U+0650 kasra, U+064D kasratan, U+0652 sukun,
#   U+0640 tatweel (kashida).
_ARABIC_DIACRITICS_RE = re.compile(
    "[\u0651\u064E\u064B\u064F\u064C\u0650\u064D\u0652\u0640]"
)


def arabic_diacritics(text):
    """Remove Arabic short-vowel marks, tanwin, shadda, sukun and tatweel.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        `text` with every occurrence of the listed marks deleted; all other
        characters (including whitespace) are left untouched.
    """
    return _ARABIC_DIACRITICS_RE.sub('', text)


def removeNumbers(text):
    """Return `text` with every character for which `str.isdigit()` is true removed."""
    non_digit_chars = filter(lambda ch: not ch.isdigit(), text)
    return ''.join(non_digit_chars)

def stemming(text):
    """Stem every token of `text` with NLTK's ISRI stemmer.

    The text is split with `word_tokenize`; the stemmed tokens are returned
    as one space-separated string.
    """
    stemmer = ISRIStemmer()
    return " ".join(stemmer.stem(token) for token in word_tokenize(text))

def remove_english_characters(text):
    """Delete every run of ASCII letters (a-z, A-Z) and trim the result.

    Only the letters themselves are removed; digits, punctuation and interior
    whitespace stay in place. Leading and trailing whitespace is stripped.
    """
    without_latin = re.sub(r'[a-zA-Z]+', '', text)
    return without_latin.strip()


def removeLetters(text):
    """Drop single-character tokens from `text`.

    The text is split with `word_tokenize`; tokens longer than one character
    are kept and joined with single spaces.
    """
    long_tokens = filter(lambda token: len(token) > 1, word_tokenize(text))
    return ' '.join(long_tokens)

def remove_punctuations(text):
    """Delete every character listed in the module-level `punctuations`.

    Reuses the module-level `translator` table (built once from
    `punctuations`, which already ends with `string.punctuation`) instead of
    rebuilding an equivalent table on every call.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        `text` without the punctuation characters.
    """
    return text.translate(translator)

def remove_extra_whitespace(string):
    """Collapse every run of whitespace into one space and trim both ends.

    Parameters
    ----------
    string : str
        Input text. The parameter name shadows the `string` module inside this
        function; it is kept because callers may pass it by keyword.

    Returns
    -------
    str
        The text with single spaces between words and no leading or trailing
        whitespace.
    """
    # One pass is enough: once each whitespace run is a single space there is
    # nothing left for a second "two or more whitespace" substitution to match.
    return re.sub(r'\s+', ' ', string).strip()


def cleaning(text):
    """Normalise whitespace and squeeze repeated characters.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The text after two steps:
        1. every run of whitespace is replaced by a single space;
        2. every run of the same character is reduced to one occurrence
           (this also shortens legitimately doubled letters).
    """
    # 1. Collapse whitespace. The pattern must be r"\s+"; a bare "s+" would
    #    match the letter "s" and replace it with a space.
    text = re.sub(r"\s+", " ", text)

    # 2. Squeeze any character repeated back-to-back down to a single copy.
    text = re.sub(r'(.)\1+', r'\1', text)
    return text

In [5]:
def preprocess_comment(text):
    """Run one raw comment through the full cleaning pipeline.

    The steps are applied in this order: strip diacritics, normalise letter
    variants, drop digits, drop Latin letters, drop punctuation, drop
    single-character tokens, squeeze repeats, normalise whitespace, stem.
    Stop-word removal is intentionally not part of the pipeline.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned, stemmed comment.
    """
    pipeline = (
        arabic_diacritics,
        NormalizeArabic,
        removeNumbers,
        remove_english_characters,
        # remove_punctuations already deletes everything in `punctuations`,
        # so a separate translate(translator) pass beforehand is not needed.
        remove_punctuations,
        removeLetters,
        cleaning,
        remove_extra_whitespace,
        stemming,
    )
    for step in pipeline:
        text = step(text)
    return text


# Apply the same pipeline to both splits in one vectorised pass instead of
# updating the frames row by row. Diacritic removal and letter normalisation
# touch disjoint character sets, so using one order for both splits gives the
# same result as the previous per-split orders.
# NOTE: the 'comment' column is overwritten in place, so re-running this cell
# applies the pipeline (including stemming) a second time.
df_train['comment'] = df_train['comment'].map(preprocess_comment)
df_test['comment'] = df_test['comment'].map(preprocess_comment)

print(df_test.head())
print(df_train.head())

   ID                                            comment
0   1        لقح نعم لنه فابورالناس في مصر را كتخلص عليه
1   2                          اثب لقح اهم في لحد من وفي
2   3  انا لقح جوج رات حمد له عند نعه ديل حيت درت لقح...
3   4  كنا تكد من ان جلل ملك جعل لقح مجا لكل غرب وحت ...
4   5  شعب انا شبع ثقف خرف ءمر ندم عندو هني يضر ملا ا...
   ID                                            comment  label
0   1  انا اوص من هذا نبر لكل لتج الي ركز لقح صدقو ام...      1
1   2  هناك كثر لا فهم قصد كورو ليس صعب علي شبب لكن ص...      1
2   3  حمد له رقم قبل قرن بدل طقه لول ظهر كورو تحر هن...      1
3   4  انا شخص اءد ما فرض سلط من ضرر دلء جوز لقح بهذا...      1
4   5  نفس لشء في دين رشد ركز لقح غلق الي غيه اثن اين...      1


# Feature extraction (TF-IDF):

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word uni-grams and bi-grams, sub-linear term frequency (1 + log(tf)) and
# L2-normalised rows.
tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2', ngram_range=(1, 2))

# Fit the vocabulary on the training comments only. The result is kept as the
# sparse matrix returned by the vectorizer: a dense copy of a
# (n_documents x tens-of-thousands of n-grams) matrix costs hundreds of
# megabytes, and scikit-learn's linear models accept sparse input directly.
features_train = tfidf.fit_transform(df_train['comment'])
labels_train = df_train['label']

print(features_train.shape)

# Re-use the fitted vocabulary and IDF weights for the test comments.
features_test = tfidf.transform(df_test['comment'])
print(features_test.shape)

(1920, 51771)
(240, 51771)


In [7]:
# Logistic-regression classifier trained on the TF-IDF training features.
# random_state is pinned to 0; every other hyper-parameter is left at the
# scikit-learn default.
model_lr = LogisticRegression(random_state=0)
model_lr.fit(features_train, labels_train)

LogisticRegression(random_state=0)

In [8]:
# Predict one label per row of the test feature matrix.
y_pred_lr = model_lr.predict(features_test)

In [9]:
# Load the sample submission file; its 'ID' column is reused for the final CSV.
submission = pd.read_csv("../input/sentiment-analysis-on-moroccan-arabic-dialect/sample_submission.csv")

In [10]:
# Attach the predictions as the 'label' column, keep only the two submitted
# columns and write the result without the index.
# NOTE(review): predictions are assigned by position, which assumes the sample
# submission lists rows in the same order as df_test -- confirm.
submission = submission.assign(label=y_pred_lr)[['ID', 'label']]
submission.to_csv('submission1.csv', index=False)