In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')
#for text pre-processing
import re, string
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix,roc_auc_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#for word embedding
import gensim
from gensim.models import Word2Vec
import requests
from bs4 import BeautifulSoup
import math

#model validation
from sklearn.model_selection import cross_val_score

In [2]:
#1. Load the dataset
df_blog = pd.read_csv("blogtext.csv")
df_blog.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [4]:
df_blog.shape
#Since datset is too large it will take a long time for preprocessing 
#Taking the sample

(681284, 7)

In [11]:
df = df_blog.sample(10000, random_state=10) #Sampling
df.shape 

(10000, 7)

In [12]:
#2. Preprocess rows of the “text” column
#convert to lowercase, strip and remove punctuations
def preprocess(text):
    text = text.lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text

In [13]:
# STOPWORD REMOVAL
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

In [14]:
# Lemmatization 
lematize = WordNetLemmatizer() # stemmer = Porterstemmer()

In [15]:
def lemmatizing(word):
    lematize = WordNetLemmatizer()
    if(word!= lematize.lemmatize(word,pos='a')):
        return lematize.lemmatize(word,pos='a')
    elif(word!= lematize.lemmatize(word,pos='v')):
        return lematize.lemmatize(word,pos='v')
    else:
        return word

In [16]:
def finalpreprocess(string):
    return lemmatizing(stopword(preprocess(string)))
df['clean_text'] = df['text'].apply(lambda x: finalpreprocess(x))
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,clean_text
262626,3637189,female,27,Consulting,Aquarius,"14,June,2004",All of this rehashing of the ex-b...,rehashing ex boyfriend years opened psychologi...
673780,3389862,female,36,indUnk,Pisces,"01,June,2004",angelemma2000@hotmail.com (Emma's...,angelemma hotmail com emma e mail address ill ...
322119,3295631,female,15,Education,Aquarius,"10,May,2004",On hiatus. Some far off state. A...,hiatus far state also known illinois family re...
52408,2153234,male,27,Religion,Pisces,"23,December,2003",i have been away from internet access for a...,away internet access judging comments either d...
313202,2575612,male,17,Student,Gemini,"03,June,2004",why is my title wayword words? (d...,title wayword words deep thought day well toni...


In [18]:
#3a. Label columns to merge: “gender”, “age”, “topic”, “sign"
df['labels'] = df[df.columns[1:4]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)


In [19]:
#3b
df.drop(['id', 'date','text'],axis=1, inplace=True)

In [21]:
df.drop(['gender', 'age','topic','sign'],axis=1, inplace=True)

In [25]:
df.reset_index(inplace=True,drop=True)

In [27]:
df.drop('index',axis=1, inplace=True)

In [28]:
df.head()

Unnamed: 0,clean_text,labels
0,rehashing ex boyfriend years opened psychologi...,"female,27,Consulting"
1,angelemma hotmail com emma e mail address ill ...,"female,36,indUnk"
2,hiatus far state also known illinois family re...,"female,15,Education"
3,away internet access judging comments either d...,"male,27,Religion"
4,title wayword words deep thought day well toni...,"male,17,Student"


In [29]:
#4. Separate features and labels, and split the data into training and testing
X = df['clean_text']
y = df['labels']
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

In [30]:
print(Xtrain.shape)
print(Xtest.shape)
print(ytrain.shape)
print(ytest.shape)

(8000,)
(2000,)
(8000,)
(2000,)


In [31]:
#5a Vectorize the features
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
vect = CountVectorizer(stop_words='english', ngram_range=(1,2))
X_train_dtm = vect.fit_transform(Xtrain)
X_test_dtm = vect.transform(Xtest)
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(8000, 609982)
(2000, 609982)


In [60]:
#5b.Print the term-document matrix
print(X_train_dtm)
print(X_test_dtm)

  (0, 384562)	3
  (0, 422979)	2
  (0, 50491)	2
  (0, 160505)	1
  (0, 101496)	1
  (0, 530303)	1
  (0, 324274)	1
  (0, 277525)	2
  (0, 474828)	1
  (0, 217724)	1
  (0, 357970)	2
  (0, 585454)	1
  (0, 141459)	1
  (0, 323390)	1
  (0, 521371)	1
  (0, 341799)	1
  (0, 354345)	1
  (0, 549747)	1
  (0, 63368)	1
  (0, 250658)	1
  (0, 142041)	1
  (0, 535741)	1
  (0, 287964)	1
  (0, 545422)	1
  (0, 314426)	1
  :	:
  (7998, 451564)	1
  (7998, 92828)	1
  (7998, 490937)	1
  (7999, 357970)	1
  (7999, 585859)	1
  (7999, 382164)	1
  (7999, 589164)	1
  (7999, 189522)	1
  (7999, 321058)	2
  (7999, 452152)	1
  (7999, 303442)	1
  (7999, 444502)	1
  (7999, 189654)	1
  (7999, 272499)	1
  (7999, 358513)	1
  (7999, 490678)	1
  (7999, 321104)	1
  (7999, 272504)	1
  (7999, 321171)	1
  (7999, 589260)	1
  (7999, 303472)	1
  (7999, 586824)	1
  (7999, 490679)	1
  (7999, 452290)	1
  (7999, 444508)	1
  (0, 1255)	1
  (0, 4315)	1
  (0, 4839)	1
  (0, 74476)	1
  (0, 74830)	1
  (0, 93081)	1
  (0, 93594)	1
  (0, 123659)	1
  (0

In [43]:
# 6.Create a dictionary to get the count of every label i.e.
# the key will be label name and value will be the total count of the label.
dic = {}
for i in df.iloc[:,1:5].columns:
    dic.update({k:v for (k,v) in zip(df[i].value_counts().index, df[i].value_counts().values)})
dic

{'female,16,Student': 343,
 'male,17,Student': 274,
 'female,24,indUnk': 273,
 'male,24,indUnk': 251,
 'female,23,indUnk': 247,
 'female,17,Student': 238,
 'male,16,Student': 233,
 'male,17,indUnk': 188,
 'male,25,indUnk': 180,
 'female,26,indUnk': 178,
 'female,16,indUnk': 175,
 'female,17,indUnk': 171,
 'female,27,indUnk': 161,
 'male,15,Student': 158,
 'female,25,indUnk': 152,
 'female,15,Student': 145,
 'male,16,indUnk': 144,
 'female,15,indUnk': 140,
 'male,27,indUnk': 135,
 'female,14,Student': 128,
 'male,23,indUnk': 124,
 'male,26,indUnk': 115,
 'female,23,Student': 114,
 'male,24,Technology': 114,
 'female,34,indUnk': 108,
 'male,15,indUnk': 95,
 'female,14,indUnk': 94,
 'male,34,indUnk': 93,
 'male,14,Student': 83,
 'female,24,Arts': 82,
 'male,24,Student': 80,
 'male,23,Student': 79,
 'male,27,Technology': 70,
 'female,38,indUnk': 58,
 'male,26,Technology': 56,
 'male,25,Student': 55,
 'male,35,Technology': 54,
 'male,33,indUnk': 54,
 'female,33,indUnk': 53,
 'male,17,Non-Pr

In [44]:
#7 Transform the labels 
from sklearn.preprocessing import MultiLabelBinarizer

In [45]:
mlb = MultiLabelBinarizer()

In [48]:
y_train_lables = mlb.fit_transform(ytrain)
y_test_lables = mlb.transform(ytest)

In [49]:
y_train_lables, y_test_lables

(array([[1, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 1],
        [1, 0, 0, ..., 1, 0, 0],
        ...,
        [1, 0, 1, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0]]),
 array([[1, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 0, 0, ..., 1, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 1]]))

In [50]:
#8 Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on every label
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression


In [51]:
clf = LogisticRegression(solver = 'lbfgs')
clf = OneVsRestClassifier(clf)

In [61]:
#lr = LogisticRegression(solver = 'lbfgs')
clf.fit(X_train_dtm, y_train_lables)

OneVsRestClassifier(estimator=LogisticRegression())

In [62]:
y_pred_class = clf.predict(X_test_dtm)
y_pred_class_lables = mlb.fit_transform(y_pred_class)
print(y_pred_class, y_test_lables)

[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]] [[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 1 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 1]]


In [63]:
#9 Accuracy score
accuracy_score(y_test_lables, y_pred_class)

0.003

In [64]:
#F1 score
f1_score(y_test_lables, y_pred_class,average = 'macro')

0.22928918693228184

In [67]:
#Average precision score
from sklearn.metrics import average_precision_score
average_precision_score(y_test_lables, y_pred_class)

0.27520132934123553

In [69]:
#Average recall score
from sklearn.metrics import recall_score
recall_score(y_test_lables, y_pred_class,average = 'macro')

0.20810443944050744

In [76]:
#Average is assigned micro
from sklearn.metrics import precision_score
precisionScore_sklearn_microavg = precision_score(y_test_lables, y_pred_class, average='micro')
print("precisionScore_sklearn_microavg:",precisionScore_sklearn_microavg)
# Average is assigned macro
precisionScore_sklearn_macroavg = precision_score(y_test_lables, y_pred_class, average='macro')
print("precisionScore_sklearn_macroavg:",precisionScore_sklearn_macroavg)

precisionScore_sklearn_microavg: 0.8432645243668866
precisionScore_sklearn_macroavg: 0.3383269647013467


In [78]:
#10 Print true label and predicted label for any five examples
from sklearn.metrics import multilabel_confusion_matrix
multilabel_confusion_matrix(y_test_lables[5:10], y_pred_class[5:10])

array([[[0, 0],
        [0, 5]],

       [[4, 0],
        [1, 0]],

       [[5, 0],
        [0, 0]],

       [[5, 0],
        [0, 0]],

       [[1, 0],
        [2, 2]],

       [[3, 0],
        [2, 0]],

       [[5, 0],
        [0, 0]],

       [[3, 0],
        [2, 0]],

       [[3, 0],
        [2, 0]],

       [[5, 0],
        [0, 0]],

       [[5, 0],
        [0, 0]],

       [[5, 0],
        [0, 0]],

       [[4, 0],
        [1, 0]],

       [[5, 0],
        [0, 0]],

       [[4, 0],
        [1, 0]],

       [[4, 0],
        [1, 0]],

       [[5, 0],
        [0, 0]],

       [[5, 0],
        [0, 0]],

       [[5, 0],
        [0, 0]],

       [[5, 0],
        [0, 0]],

       [[5, 0],
        [0, 0]],

       [[4, 0],
        [1, 0]],

       [[5, 0],
        [0, 0]],

       [[5, 0],
        [0, 0]],

       [[5, 0],
        [0, 0]],

       [[4, 1],
        [0, 0]],

       [[5, 0],
        [0, 0]],

       [[3, 0],
        [1, 1]],

       [[0, 0],
        [0, 5]],

       [[5, 0]