In [0]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
import re
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1.Load the dataset (5 points)  a. Tip: As the dataset is large, use fewer rows. Check what is working well on your  machine and decide accordingly.¶

In [0]:
blogtext = pd.read_csv('/content/drive/My Drive/Colab/NLP/blogtext.csv')

In [5]:
blogtext.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [6]:
blogtext.info

<bound method DataFrame.info of              id  ...                                               text
0       2059027  ...             Info has been found (+/- 100 pages,...
1       2059027  ...             These are the team members:   Drewe...
2       2059027  ...             In het kader van kernfusie op aarde...
3       2059027  ...                   testing!!!  testing!!!          
4       3581210  ...               Thanks to Yahoo!'s Toolbar I can ...
...         ...  ...                                                ...
681279  1713845  ...         Dear Susan,  I could write some really ...
681280  1713845  ...         Dear Susan,  'I have the second yeast i...
681281  1713845  ...         Dear Susan,  Your 'boyfriend' is fuckin...
681282  1713845  ...         Dear Susan:    Just to clarify, I am as...
681283  1713845  ...         Hey everybody...and Susan,  You might a...

[681284 rows x 7 columns]>

In [7]:
blogtext.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,681284.0,2397802.0,1247723.0,5114.0,1239610.0,2607577.0,3525660.0,4337650.0
age,681284.0,23.93233,7.786009,13.0,17.0,24.0,26.0,48.0


In [8]:
blogtext.shape

(681284, 7)

In [9]:
blogtext.id.value_counts()

449628     4221
734562     2301
589736     2294
1975546    2261
958176     2244
           ... 
3993280       1
3483063       1
4165047       1
3575447       1
3599127       1
Name: id, Length: 19320, dtype: int64

In [10]:
# Users with max no of product ratings
blogtextcount = blogtext["id"].value_counts()
greaterthanhundind = blogtextcount[blogtextcount >200].index
greaterthanhundind

Int64Index([ 449628,  734562,  589736, 1975546,  958176, 1107146,  303162,
             942828, 1270648, 1784456,
            ...
            1131517, 4177216, 1726011, 2169579, 2680773, 1209865, 1032153,
             956218, 1552252, 1624111],
           dtype='int64', length=561)

**The dataset is huge ,to avoid memory issues ,for the current analysis we are filtering out first 10k rows¶**

In [11]:
blogtext_trimmed =  blogtext.head(10000)
blogtext_trimmed.shape

(10000, 7)

# 2.Preprocess rows of the “text” column (7.5 points)

* Remove unwanted characters 
* Convert text to lowercase
* Remove unwanted spaces
* Remove stopwords


In [0]:
data = blogtext.head(10000)

In [0]:
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext
def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned
def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent
data['text'] = data['text'].str.lower()
data['text'] = data['text'].apply(cleanHtml)
data['text'] = data['text'].apply(cleanPunc)
data['text'] = data['text'].apply(keepAlpha)

**Removing Stop words**

In [0]:
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)
data['text'] = data['text'].apply(removeStopWords)

In [15]:
data.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files ...
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink m...
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eig...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popup...


# 3.As we want to make this into a multi-label classification problem, you are required to merge  all the label columns together, so that we have all the labels together for a particular sentence  (7.5 points)

* Label columns to merge: “gender”, “age”, “topic”, “sign”

* After completing the previous step, there should be only two columns in your data
frame i.e. “text” and “labels” as shown in the below image

In [0]:
## Removing date colum as this is not needed for our current analysis
data = data.drop(['date'], axis=1)

In [17]:
data.head()

Unnamed: 0,id,gender,age,topic,sign,text
0,2059027,male,15,Student,Leo,info found pages mb pdf files ...
1,2059027,male,15,Student,Leo,team members drewes van der laag urllink m...
2,2059027,male,15,Student,Leo,het kader van kernfusie op aarde maak je eig...
3,2059027,male,15,Student,Leo,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,thanks yahoos toolbar capture urls popup...


In [18]:
data.shape

(10000, 6)

In [19]:
data.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
text      0
dtype: int64

In [0]:
## As there are no null values ,we proceed with merging of gender,age,topic,sign into labels col

data = data.assign(labels = data.gender.astype(str) + ', ' + \
  data.age.astype(str) + ', ' + data.topic.astype(str) + ', '+ \
  data.sign.astype(str))

In [21]:
data.head()

Unnamed: 0,id,gender,age,topic,sign,text,labels
0,2059027,male,15,Student,Leo,info found pages mb pdf files ...,"male, 15, Student, Leo"
1,2059027,male,15,Student,Leo,team members drewes van der laag urllink m...,"male, 15, Student, Leo"
2,2059027,male,15,Student,Leo,het kader van kernfusie op aarde maak je eig...,"male, 15, Student, Leo"
3,2059027,male,15,Student,Leo,testing testing,"male, 15, Student, Leo"
4,3581210,male,33,InvestmentBanking,Aquarius,thanks yahoos toolbar capture urls popup...,"male, 33, InvestmentBanking, Aquarius"


In [0]:
## Dropping gender/age/topic/sign from the df as they are now merged into labels

data.drop(labels = ['gender','age','topic','sign'], axis=1,inplace=True)

In [23]:
data.head()

Unnamed: 0,id,text,labels
0,2059027,info found pages mb pdf files ...,"male, 15, Student, Leo"
1,2059027,team members drewes van der laag urllink m...,"male, 15, Student, Leo"
2,2059027,het kader van kernfusie op aarde maak je eig...,"male, 15, Student, Leo"
3,2059027,testing testing,"male, 15, Student, Leo"
4,3581210,thanks yahoos toolbar capture urls popup...,"male, 33, InvestmentBanking, Aquarius"


# 4.Separate features and labels, and split the data into training and testing (5 points)¶

**Features is the text column,labels is the labels column**

In [0]:
X = data.drop(labels = ['id','labels'], axis=1)
y = data.drop(labels = ['id','text'], axis=1)

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.text,y.labels,test_size=0.3, random_state=42, shuffle=True)

In [0]:
import pickle

pickle_out = open("/content/drive/My Drive/Colab/NLP/X_train.pickle","wb")
pickle.dump(X_train,pickle_out)
pickle_out.close()

pickle_out = open("/content/drive/My Drive/Colab/NLP/y_train.pickle","wb")
pickle.dump(y_train,pickle_out)
pickle_out.close()

pickle_out = open("/content/drive/My Drive/Colab/NLP/X_test.pickle","wb")
pickle.dump(X_test,pickle_out)
pickle_out.close()

pickle_out = open("/content/drive/My Drive/Colab/NLP/y_test.pickle","wb")
pickle.dump(y_test,pickle_out)
pickle_out.close()

In [0]:
pickle_in = open("/content/drive/My Drive/Colab/NLP/X_train.pickle","rb")
X_train = pickle.load(pickle_in) 
pickle_in.close()

pickle_in = open("/content/drive/My Drive/Colab/NLP/y_train.pickle","rb")
y_train = pickle.load(pickle_in) 
pickle_in.close()

pickle_in = open("/content/drive/My Drive/Colab/NLP/X_test.pickle","rb")
X_test = pickle.load(pickle_in) 
pickle_in.close()

pickle_in = open("/content/drive/My Drive/Colab/NLP/y_test.pickle","rb")
y_test = pickle.load(pickle_in) 
pickle_in.close()

# 5.Vectorize the features (5 points)

**A. Create a Bag of Words using count vectorizer**
* Use ngram_range=(1, 2)
* Vectorize training and testing features

**Print the term-document matrix**

In [0]:
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)

In [30]:
print(X_test_dtm.shape)
print(X_train_dtm.shape)
print (vectorizer.get_feature_names()[-30:])

(3000, 481881)
(7000, 481881)
['zun personally', 'zuo', 'zuo le', 'zur', 'zur gumitlikeit', 'zy', 'zy expect', 'zywiec', 'zywiec possibility', 'zz', 'zz top', 'zza', 'zza dong', 'zzz', 'zzzexy', 'zzzexy im', 'zzzs', 'zzzs well', 'zzzz', 'zzzz glad', 'zzzz slept', 'zzzzzz', 'zzzzzzz', 'zzzzzzzzz', 'zzzzzzzzz good', 'zzzzzzzzzz', 'zzzzzzzzzz didnt', 'zzzzzzzzzzzz', 'zzzzzzzzzzzzz', 'zzzzzzzzzzzzzzz']


# 6.Create a dictionary to get the count of every label i.e. the key will be label name and value will  be the total count of the label. Check below image for reference (5 points)

In [31]:
countsgender = blogtext_trimmed['gender'].value_counts().to_dict()

countsage = blogtext_trimmed['age'].value_counts().to_dict()

countstopic = blogtext_trimmed['topic'].value_counts().to_dict()

countssign = blogtext_trimmed['sign'].value_counts().to_dict()

dictlabels = {**countsgender , **countsage, **countstopic, **countssign}
 
print(' dictionary to get the count of every label  :\n---------------------------------')
print(dictlabels)

 dictionary to get the count of every label  :
---------------------------------
{'male': 5916, 'female': 4084, 35: 2315, 36: 1708, 17: 1185, 27: 1054, 24: 655, 15: 602, 34: 553, 16: 440, 25: 386, 23: 253, 26: 234, 14: 212, 33: 136, 39: 79, 38: 46, 13: 42, 37: 33, 41: 20, 45: 16, 42: 14, 46: 7, 43: 6, 44: 3, 40: 1, 'indUnk': 3287, 'Technology': 2654, 'Fashion': 1622, 'Student': 1137, 'Education': 270, 'Marketing': 156, 'Engineering': 127, 'Internet': 118, 'Communications-Media': 99, 'BusinessServices': 91, 'Sports-Recreation': 80, 'Non-Profit': 71, 'InvestmentBanking': 70, 'Science': 63, 'Arts': 45, 'Consulting': 21, 'Museums-Libraries': 17, 'Banking': 16, 'Automotive': 14, 'Law': 11, 'LawEnforcement-Security': 10, 'Religion': 9, 'Accounting': 4, 'Publishing': 4, 'HumanResources': 2, 'Telecommunications': 2, 'Aries': 4198, 'Sagittarius': 1097, 'Scorpio': 971, 'Taurus': 812, 'Aquarius': 571, 'Cancer': 504, 'Libra': 491, 'Pisces': 454, 'Leo': 301, 'Virgo': 236, 'Capricorn': 215, 'Gemini'

# 7.Transform the labels - (7.5 points)
**As we have noticed before, in this task each example can have multiple tags. To deal with such kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. For this purpose, it is convenient to use ​MultiLabelBinarizer​ from sklearn**
* Convert your train and test labels using MultiLabelBinarizer

In [32]:
# transform to dictionary
y_train = [set(i.split(',')) for i in y_train]
y_test = [set(i.split(',')) for i in y_test]
print(y_train[0])
print(y_test[0])

{' indUnk', 'female', ' Sagittarius', ' 24'}
{' 23', ' Consulting', 'male', ' Taurus'}


In [0]:
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [36]:
y_test[0]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1])

In [37]:
y_train[0]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0])

In [38]:
mlb.classes_

array([' 13', ' 14', ' 15', ' 16', ' 17', ' 23', ' 24', ' 25', ' 26',
       ' 27', ' 33', ' 34', ' 35', ' 36', ' 37', ' 38', ' 39', ' 41',
       ' 42', ' 43', ' 44', ' 45', ' 46', ' Accounting', ' Aquarius',
       ' Aries', ' Arts', ' Automotive', ' Banking', ' BusinessServices',
       ' Cancer', ' Capricorn', ' Communications-Media', ' Consulting',
       ' Education', ' Engineering', ' Fashion', ' Gemini',
       ' HumanResources', ' Internet', ' InvestmentBanking', ' Law',
       ' LawEnforcement-Security', ' Leo', ' Libra', ' Marketing',
       ' Museums-Libraries', ' Non-Profit', ' Pisces', ' Publishing',
       ' Religion', ' Sagittarius', ' Science', ' Scorpio',
       ' Sports-Recreation', ' Student', ' Taurus', ' Technology',
       ' Telecommunications', ' Virgo', ' indUnk', 'female', 'male'],
      dtype=object)

# 8.Choose a classifier - (5 points)

**In this task, we suggest using the One-vs-Rest approach, which is implemented in OneVsRestClassifier​ class. In this approach k classifiers (= number of tags) are trained. As a basic classifier, use ​LogisticRegression​. It is one of the simplest methods, but often it performs good enough in text classification tasks. It might take some time because the number of classifiers to train is large.**


In [0]:
clf = LogisticRegression(solver = 'lbfgs')
clf = OneVsRestClassifier(clf)

# 9. Fit the classifier, make predictions and get the accuracy (5 points)

In [0]:
clf.fit(X_train_dtm, y_train)
y_pred_class = clf.predict(X_test_dtm)

In [41]:
print("Accuracy score:",metrics.accuracy_score(y_test, y_pred_class))
print(' \n ')
print("F1 score micro:",metrics.f1_score(y_test, y_pred_class,average='micro'))
print(' \n ')
print("F1 score macro:",metrics.f1_score(y_test, y_pred_class,average='macro'))
print(' \n ')
print("Average precision score:",metrics.precision_score(y_test, y_pred_class,average='weighted'))
print(' \n ')
print("Average Recall score:",metrics.recall_score(y_test, y_pred_class,average='weighted'))

Accuracy score: 0.30966666666666665
 
 
F1 score micro: 0.6394081381011097
 
 
F1 score macro: 0.269767539976007
 
 
Average precision score: 0.7748538293680279
 
 
Average Recall score: 0.5402116843070256


# 10.Print true label and predicted label for any five examples (7.5 points)

In [42]:
y_test_pred_inversed = mlb.inverse_transform(y_pred_class)
y_test_inversed = mlb.inverse_transform(y_test)
for i in range(10,15):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_test.get(i),
        ','.join(y_test_inversed[i]),
        ','.join(y_test_pred_inversed[i])
    ))

Title:	ah  korean language  looks  difficult  first    figure    read hanguel koreas surprisingly easy  learn alphabet    characters  seems  easy   vocabulary starts oh    backwards  us sentence structure yikes luckily   many options  us slow witted foreigners  take   language  course  could list    urllink  joongang article says  lot     resources urllink  well  youre  guy    motivation    jeon ji hyun    latest  something   actually star  movies  cfs  hear  means commercial feature   positive  saw  latest movie  sunday night  hard  describe  name  english version  windstruck   korean version    yeochinso   short    ne yeojachingu rul sogayhamnida  id like  introduce    girlfriend surprisingly  titles make sense   like    website korean  english looks quite good actually urllink   movie  shown  theatres  subtitles  special times  info  urllink    list  many   theatres  seoul click urllink  urllink    great reason  learn korean   wasnt already married    went    foreigners  well   loca