In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
import scipy.sparse as sp

In [2]:
#pre-processing
df = pd.read_csv('emails.csv', encoding= 'ISO-8859-1', names=['sender', 'subject', 'label'])

In [3]:
df.isnull().sum()

sender     0
subject    1
label      0
dtype: int64

In [4]:
df["label"].value_counts()

ad                   190
general knowledge    170
career               117
account              101
important             72
Name: label, dtype: int64

In [5]:
#label encoding: ad: 0, important: 1, career: 2, account: 3, general knowlegde: 4
df.loc[df['label'] == 'ad', 'label',] = 0
df.loc[df['label'] == 'important', 'label',] = 1
df.loc[df['label'] == 'career', 'label',] = 2
df.loc[df['label'] == 'account', 'label',] = 3
df.loc[df['label'] == 'general knowledge', 'label',] = 4

In [6]:
print(df)

                                           sender  \
0      Quizlet <newsletter@lifecycle.quizlet.com>   
1       Unity Technologies <accounts@unity3d.com>   
2    Grammarly Insights <info@send.grammarly.com>   
3       Unity Technologies <accounts@unity3d.com>   
4       Unity Technologies <accounts@unity3d.com>   
..                                            ...   
645               Reddit <noreply@redditmail.com>   
646               Reddit <noreply@redditmail.com>   
647               Reddit <noreply@redditmail.com>   
648         InFormation* <bkismail2000@gmail.com>   
649       "InFormation*" <bkismail2000@gmail.com>   

                                               subject label  
0          You're so close to hitting a 2-week streak.     0  
1    You're about to lose access to Unity Student T...     3  
2            Your weekly stats + one from us: 50% Off!     0  
3           Your Unity Student Plan is about to expire     3  
4           Your Unity Student Plan is about to 

In [7]:
#separating content and labels
x_sender = df['sender']
x_subject = df['subject']
y = df['label']

In [8]:
print(x_subject)

0            You're so close to hitting a 2-week streak.
1      You're about to lose access to Unity Student T...
2              Your weekly stats + one from us: 50% Off!
3             Your Unity Student Plan is about to expire
4             Your Unity Student Plan is about to expire
                             ...                        
645    "1.4k online member Still no Karma!!! I will ..."
646    "[Premier league] Martin Odegaard is Player o..."
647    "[Hanif Berkane] Walid Regragui on semi-final..."
648                                          summer pics
649                                                  NaN
Name: subject, Length: 650, dtype: object


In [9]:
#split df:
#SENDER
x_sender_train, x_sender_test, y_train, y_test = train_test_split(x_sender, y, test_size=0.2, random_state=42)
#SUBJECT
x_subject_train, x_subject_test, y_train, y_test = train_test_split(x_sender, y, test_size=0.2, random_state=42)

In [10]:
print(x_sender.shape)
print(x_subject.shape)
print(x_sender_train.shape)
print(x_sender_test.shape)
print(x_sender_train)

(650,)
(650,)
(520,)
(130,)
333                      Reddit <noreply@redditmail.com>
29        "Victoria | APILayer" <marketing@apilayer.com>
553           Codecademy <learn@itr.mail.codecademy.com>
286       "Victoria | APILayer" <marketing@apilayer.com>
425    Binny - ProjectPro Founder <binnymathews@proje...
                             ...                        
71           Adobe Creative Cloud <mail@email.adobe.com>
106    CollegeXpress <CollegeXpress@email.collegexpre...
270    Quora Digest <english-personalized-digest@quor...
435                          George <george@example.com>
102               Naba de Prezi <email@create.prezi.com>
Name: sender, Length: 520, dtype: object


In [11]:
print(y_test)

637    4
220    2
428    4
326    0
72     2
      ..
515    3
375    2
369    2
244    1
602    4
Name: label, Length: 130, dtype: object


In [12]:
#convert y_train & y_test to int
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [13]:
#feature extraction
final_stopwords_list = stopwords.words('english') + stopwords.words('french')
feature_extraction = TfidfVectorizer(min_df=1, stop_words=final_stopwords_list, lowercase=True, ngram_range=(1,3))
#SENDER
x_sender_train_features = feature_extraction.fit_transform(x_sender_train)
x_sender_test_features  = feature_extraction.transform(x_sender_test)
#SUBJECT
x_subject_train_features = feature_extraction.fit_transform(x_subject_train)
x_subject_test_features  = feature_extraction.transform(x_subject_test)

In [14]:
print(x_subject_train_features)

  (0, 672)	0.3603673762178535
  (0, 791)	0.3603673762178535
  (0, 793)	0.3603673762178535
  (0, 671)	0.3603673762178535
  (0, 790)	0.3603673762178535
  (0, 164)	0.12140601919316846
  (0, 792)	0.3603673762178535
  (0, 658)	0.2760580692692902
  (0, 789)	0.3603673762178535
  (1, 576)	0.28041367565452113
  (1, 56)	0.28041367565452113
  (1, 1040)	0.28041367565452113
  (1, 52)	0.2725026153843299
  (1, 575)	0.28041367565452113
  (1, 55)	0.28041367565452113
  (1, 1039)	0.28041367565452113
  (1, 574)	0.2725026153843299
  (1, 51)	0.5450052307686598
  (1, 1038)	0.28041367565452113
  (1, 164)	0.06349126864102965
  (2, 557)	0.25489422501422315
  (2, 465)	0.25489422501422315
  (2, 524)	0.25489422501422315
  (2, 136)	0.25489422501422315
  (2, 134)	0.25489422501422315
  :	:
  (517, 775)	0.4290742400714032
  (517, 206)	0.3554005054496008
  (517, 164)	0.06574791496898433
  (518, 314)	0.3435498842950533
  (518, 316)	0.3435498842950533
  (518, 313)	0.3435498842950533
  (518, 315)	0.3435498842950533
  (518

In [15]:
#concat features
combined_train_features = sp.hstack([x_sender_train_features, x_subject_train_features])
combined_test_features = sp.hstack([x_sender_test_features, x_subject_test_features])

In [34]:
#training the model
#logistic regression: training using only subject features data
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
model.fit(combined_train_features, y_train)

In [35]:
#evaluating the trained model:
#predicting the testing data
prediction_test_data = model.predict(combined_test_features)
accuracy_on_test_data = accuracy_score(y_test, prediction_test_data)

In [36]:
print('accuracy = ', accuracy_on_test_data)

accuracy =  0.7076923076923077


In [45]:
#predictive system
sender = ["Google <no-reply@accounts.google.com>"]
subject = ["Alerte de sécurité"]

#convert to feature vectors
input_sender = feature_extraction.transform(sender)
input_subject = feature_extraction.transform(subject)
input_data_features =  sp.hstack([input_sender, input_subject])

#predicting
prediction = model.predict(input_data_features)
print(prediction)

[3]
