In [1]:
# Mount Google Drive so the notebook can read and write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# import the necessary libraries
import warnings
import os
import pandas as pd 
import numpy as np
import re

Import the libraries needed to authorise Google Colab to connect to Google Drive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
# Authenticate the Colab user, then hand the application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, which the first cell imported as the
# google.colab.drive module; from here on `drive` is the PyDrive client.
drive = GoogleDrive(gauth)

Get a list of all files from the shared drive related to the project

In [None]:
# ID of the shared Drive folder that holds the project files.
PROJECT_FOLDER_ID = '1MivtY3pJYNEtcCkMvrtaFZxOt8P_NSRl'

# Ask PyDrive for every file whose parent is the project folder.
folder_query = {'q': f"'{PROJECT_FOLDER_ID}' in parents"}
file_list = drive.ListFile(folder_query).GetList()

Download every file in the list (including the CSV data file) into the data folder on the mounted Drive

In [3]:
# Switch to the data folder on the mounted Drive so the downloads land there.
os.chdir("/content/drive/MyDrive/Colab Notebooks/Data")
for remote_file in file_list:
    file_title, file_id = remote_file['title'], remote_file['id']
    print('title: %s, id: %s' % (file_title, file_id))
    # Fetch the file by ID and save it under its Drive title in the current directory.
    download_handle = drive.CreateFile({'id': file_id})
    download_handle.GetContentFile(file_title)

Move to the directory where the data is available and load it

In [3]:
# Work from the data folder on the mounted Drive, then read the blog corpus
# into a DataFrame.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/Data"
DATA_FILE = "Dataset - blogtext.csv"

os.chdir(DATA_DIR)
blog_data_df = pd.read_csv(DATA_FILE)

The Colab notebook crashes when the entire dataset is used, so only the first 10000 rows are kept for this assignment

In [4]:
# Number of leading rows kept for the rest of the notebook.
# NOTE(review): the saved outputs further down report 100000 rows, not 10000 --
# confirm which sample size was intended and re-run so code and outputs agree.
N_ROWS = 10000

# .copy() detaches the sample from the full frame, so the later column edits act
# on an independent DataFrame rather than on a slice of the original.
blog_data_df = blog_data_df.head(N_ROWS).copy()

In [5]:
# Check the (rows, columns) of the subsample.
blog_data_df.shape

(100000, 7)

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
blog_data_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [None]:
# Column dtypes and non-null counts.
# NOTE(review): the saved output lists 681284 rows, so it appears to predate the
# row subsample taken earlier -- re-run to refresh it.
blog_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  int64 
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


In [None]:
# Per-column flag: True if the column contains at least one missing value.
blog_data_df.isnull().any()

id        False
gender    False
age       False
topic     False
sign      False
date      False
text      False
dtype: bool

Drop id and date columns

In [6]:
# Drop the columns that are not used as features or labels.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run after the columns have already been removed.
blog_data_df = blog_data_df.drop(columns=['id', 'date'], errors='ignore')

In [7]:
# Store age with the generic object dtype instead of int64.
# Presumably so that age is handled as a label like the other columns -- it is
# turned into a string when the label lists are built.
age_as_object = blog_data_df['age'].astype(object)
blog_data_df['age'] = age_as_object

In [8]:
# Re-inspect the frame after dropping columns and converting `age`.
blog_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   gender  100000 non-null  object
 1   age     100000 non-null  object
 2   topic   100000 non-null  object
 3   sign    100000 non-null  object
 4   text    100000 non-null  object
dtypes: object(5)
memory usage: 3.8+ MB


Remove punctuation, digits and every other non-alphabetic character from the text (each run is replaced by a single space)

In [9]:
# Replace every run of characters outside A-Z / a-z (punctuation, digits,
# whitespace, non-ASCII text) with a single space. The vectorised .str accessor
# replaces the per-row lambda and passes missing values through instead of raising.
blog_data_df['clean_text'] = blog_data_df['text'].str.replace(r'[^A-Za-z]+', ' ', regex=True)

Convert the text into lower case

In [10]:
# Lower-case the cleaned text with the vectorised .str accessor rather than a
# per-row lambda, so differently cased words map to the same token.
blog_data_df['clean_text'] = blog_data_df['clean_text'].str.lower()

Strip the leading and trailing white space from each text

In [11]:
# Trim leading/trailing whitespace left behind by the character replacement,
# using the vectorised .str accessor rather than a per-row lambda.
blog_data_df['clean_text'] = blog_data_df['clean_text'].str.strip()

Import stopwords corpus

In [12]:
# Fetch the NLTK stop-word corpus so nltk.corpus.stopwords can be read.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
# Load NLTK's English stop-word list into a set for membership tests.
# The corpus reader is imported under an alias so that the name `stopwords`
# only ever refers to the resulting set.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words('english'))

Remove all stopwords from the text

In [14]:
def _drop_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in `stopwords`."""
    kept_tokens = [token for token in text.split() if token not in stopwords]
    return ' '.join(kept_tokens)


blog_data_df['clean_text'] = blog_data_df['clean_text'].apply(_drop_stopwords)

Merge the remaining columns (gender, age, topic and sign) into a single labels column

In [15]:
# Build one [gender, age, topic, sign] list per row. Age is converted to a string
# so that every label has the same type. Zipping the columns avoids the slow
# row-wise DataFrame.apply(axis=1).
label_lists = [
    [gender, str(age), topic, sign]
    for gender, age, topic, sign in zip(
        blog_data_df['gender'],
        blog_data_df['age'],
        blog_data_df['topic'],
        blog_data_df['sign'],
    )
]
# Wrapping in a Series aligned on the frame's index stores each list as one cell.
blog_data_df['labels'] = pd.Series(label_lists, index=blog_data_df.index, dtype=object)

Retain the cleaned data and labels

In [16]:
# Keep only the model input (clean_text) and the target (labels).
columns_to_keep = ['clean_text', 'labels']
blog_data_df = blog_data_df.loc[:, columns_to_keep]

In [17]:
# Preview the cleaned text alongside its label list.
blog_data_df.head()

Unnamed: 0,clean_text,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


In [18]:
# Features are the cleaned text, targets are the label lists. Both names are
# rebound further down to the vectorised features and the binarised labels.
X, Y = blog_data_df['clean_text'], blog_data_df['labels']

Using CountVectorizer transform the text

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words over unigrams and bigrams: each feature records whether the
# n-gram occurs in the text, not how often.
vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
# Read the text straight from the DataFrame instead of from `X`, so that re-running
# this cell does not feed the already-vectorised matrix back into the vectorizer.
X = vectorizer.fit_transform(blog_data_df['clean_text'])

Create a dictionary that maps each label to the number of times it appears

In [20]:
from collections import Counter

# Tally how many rows carry each label value; gender, age, topic and sign values
# all share one namespace. Counter replaces the hand-written counting loop, and
# dict() keeps the result a plain dictionary.
label_counts = dict(Counter(
    label
    for row_labels in blog_data_df.labels.values
    for label in row_labels
))

Use MultiLabelBinarizer to one-hot encode the labels column, in which every row holds several labels

In [21]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fix the column order of the indicator matrix to the sorted label names.
sorted_label_names = sorted(label_counts)
binarizer = MultiLabelBinarizer(classes=sorted_label_names)
# Y gets one row per text and one 0/1 column per label.
Y = binarizer.fit_transform(blog_data_df.labels)

Split the data into train and test

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42)
Xtrain, Xtest, Ytrain, Ytest = split_parts

Use a combination of OneVsRestClassifier and LogisticRegression models

In [24]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# The 'saga' solver shuffles the data while optimising, so a fixed random_state
# is set to make the fitted coefficients and the reported scores repeatable.
base_classifier = LogisticRegression(solver='saga', max_iter=300, random_state=42)
# One independent binary classifier is trained per label column.
model = OneVsRestClassifier(base_classifier)

Fit the model

In [None]:
# Train the one-vs-rest logistic-regression model on the training split.
model.fit(Xtrain,Ytrain)

In [59]:
# With a multi-label indicator target, score() reports subset accuracy: a sample
# counts as correct only if every one of its labels is predicted exactly.
print("Training Accuracy")
model.score(Xtrain, Ytrain)

Training Accuracy


0.959625

In [60]:
# Same subset-accuracy metric on the held-out split: a sample counts as correct
# only if every one of its labels is predicted exactly.
print("Testing Accuracy")
model.score(Xtest,Ytest)

Testing Accuracy


0.3145

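As seen above, the test accuracy (0.31) is far below the training accuracy (0.96), so the model is heavily overfitted. Note that for multi-label targets this score is the subset accuracy: a sample counts as correct only when all four of its labels are predicted exactly, which makes it a harsh metric.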

In [61]:
# Predict the 0/1 label indicator matrix for the held-out texts.
y_pred = model.predict(Xtest)

Display the accuracy, micro-averaged F1 score and micro-averaged recall on the test set

In [65]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
# Subset accuracy of the predictions; the saved value matches model.score() on the test split.
print(accuracy_score(Ytest, y_pred))

0.3145

In [67]:
# Micro-averaged F1: true/false positives and false negatives are pooled over all label columns.
print(f1_score(Ytest, y_pred, average='micro'))

0.6464891041162227


In [68]:
# Micro-averaged recall: share of all true labels, pooled over label columns, that were predicted.
print(recall_score(Ytest, y_pred, average='micro'))

0.534


Use the inverse transformation to get the labels back from the one hot encodings (binarizer)

In [76]:
# Map each 0/1 indicator row back to its label names, for predictions and ground truth alike.
y_pred_inversed = binarizer.inverse_transform(y_pred)
Ytest_inversed = binarizer.inverse_transform(Ytest)

Display the actual and predicted labels for 5 text items

In [81]:
# Print the true and predicted label sets side by side for the first five test items.
report_template = 'Output:\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'
for sample_idx in range(5):
    true_labels = ','.join(Ytest_inversed[sample_idx])
    predicted_labels = ','.join(y_pred_inversed[sample_idx])
    print(report_template.format(true_labels, predicted_labels))

Output:
True labels:	23,Consulting,Taurus,male
Predicted labels:	male


Output:
True labels:	17,Aquarius,indUnk,male
Predicted labels:	male


Output:
True labels:	35,Aries,Technology,male
Predicted labels:	Aries,male


Output:
True labels:	23,Aquarius,Automotive,female
Predicted labels:	17,female,indUnk


Output:
True labels:	34,Sagittarius,female,indUnk
Predicted labels:	34,Sagittarius,female,indUnk


