In [1]:
#connect to google drive
from google.colab import drive
import os
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load formatted Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Just to check if data is correctly saved or not
from numpy import load
x = load('/content/drive/MyDrive/LablFiles/Stat_NL_Project/x.npy',allow_pickle=True)
y = load('/content/drive/MyDrive/LablFiles/Stat_NL_Project/y.npy',allow_pickle=True)

In [3]:
x1 = pd.DataFrame(x,columns={'ProcessedText','text'})

In [4]:
x1.head(1)

Unnamed: 0,text,ProcessedText
0,It's 9.51 pm now. Sitting in my study r...,9 51 pm now sit studi room bo liaoz feel f ked...


In [5]:
y

array([['female', 16, 'Student', 'Cancer'],
       ['male', 17, 'indUnk', 'Virgo'],
       ['female', 15, 'indUnk', 'Taurus'],
       ...,
       ['female', 34, 'Education', 'Virgo'],
       ['female', 23, 'Student', 'Leo'],
       ['male', 23, 'indUnk', 'Scorpio']], dtype=object)

In [6]:
y1 = pd.DataFrame(y,columns={'c1','c2','c3','c4'})

In [7]:
y1.head(2)

Unnamed: 0,c4,c3,c1,c2
0,female,16,Student,Cancer
1,male,17,indUnk,Virgo


In [8]:
y1.rename(columns={'c1':'sign','c2':'topic','c3':'gender','c4':'age'},inplace=True)

In [9]:
y1.head(2)

Unnamed: 0,age,gender,sign,topic
0,female,16,Student,Cancer
1,male,17,indUnk,Virgo


In [10]:
finaldf = pd.concat([x1,y1],axis=1)

In [11]:
finaldf.head(2)

Unnamed: 0,text,ProcessedText,age,gender,sign,topic
0,It's 9.51 pm now. Sitting in my study r...,9 51 pm now sit studi room bo liaoz feel f ked...,female,16,Student,Cancer
1,The sun was setting and I was high up on a ...,sun set high hill look barren tree front still...,male,17,indUnk,Virgo


In [12]:
finaldf['labels'] =finaldf.apply(lambda row: [row['gender'], str(row['age']), row['topic'], row['sign']], axis=1)

In [13]:
finaldf1 = finaldf[['ProcessedText','labels']]

In [14]:
finaldf1.head(2)

Unnamed: 0,ProcessedText,labels
0,9 51 pm now sit studi room bo liaoz feel f ked...,"[16, female, Cancer, Student]"
1,sun set high hill look barren tree front still...,"[17, male, Virgo, indUnk]"


# Build Model - Using CountVectoriser Method

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
#Split data into 70:30 :: train : test 
X_train, X_test, y_train, y_test = train_test_split(finaldf1.ProcessedText.values,
                                                    finaldf1.labels.values,
                                                    test_size=0.3,
                                                    random_state=32
                                                    )

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
vect = CountVectorizer(ngram_range=(1,2)) 

In [19]:
x_train1 = vect.fit_transform(X_train)
x_test1 = vect.transform(X_test)

In [20]:
x_train1.shape

(4768, 395597)

In [21]:
x_test1.shape

(2044, 395597)

In [22]:
label_counts = dict()

for labels in finaldf1.labels.values:
    for label in labels:
        if label in label_counts:
            label_counts[label] += 1
        else:
            label_counts[label] = 1

In [23]:
label_counts.keys()

dict_keys([16, 'female', 'Cancer', 'Student', 17, 'male', 'Virgo', 'indUnk', 15, 'Taurus', 26, 'Fashion', 27, 'Gemini', 25, 'Aries', 'Communications-Media', 'Leo', 'Technology', 38, 'Sagittarius', 24, 23, 'Aquarius', 'Arts', 37, 'Banking', 'Scorpio', 'BusinessServices', 'Pisces', 'Advertising', 'Capricorn', 'Education', 45, 'Telecommunications', 'Non-Profit', 35, 'Science', 13, 39, 14, 48, 36, 'RealEstate', 34, 'Marketing', 'Libra', 'Government', 33, 'InvestmentBanking', 'Religion', 'Biotech', 'Museums-Libraries', 'Chemicals', 'Publishing', 'Engineering', 'Internet', 47, 'Tourism', 'Automotive', 'Military', 46, 'Architecture', 'Law', 'LawEnforcement-Security', 'Construction', 42, 'Accounting', 'Manufacturing', 'HumanResources', 'Transportation', 43, 40, 'Agriculture', 'Consulting', 'Sports-Recreation', 44, 'Maritime', 'Environment', 41])

In [24]:
keys_values = label_counts.items()
label_counts_str = {str(key): str(value) for key, value in keys_values}

In [25]:
label_counts_str.keys()

dict_keys(['16', 'female', 'Cancer', 'Student', '17', 'male', 'Virgo', 'indUnk', '15', 'Taurus', '26', 'Fashion', '27', 'Gemini', '25', 'Aries', 'Communications-Media', 'Leo', 'Technology', '38', 'Sagittarius', '24', '23', 'Aquarius', 'Arts', '37', 'Banking', 'Scorpio', 'BusinessServices', 'Pisces', 'Advertising', 'Capricorn', 'Education', '45', 'Telecommunications', 'Non-Profit', '35', 'Science', '13', '39', '14', '48', '36', 'RealEstate', '34', 'Marketing', 'Libra', 'Government', '33', 'InvestmentBanking', 'Religion', 'Biotech', 'Museums-Libraries', 'Chemicals', 'Publishing', 'Engineering', 'Internet', '47', 'Tourism', 'Automotive', 'Military', '46', 'Architecture', 'Law', 'LawEnforcement-Security', 'Construction', '42', 'Accounting', 'Manufacturing', 'HumanResources', 'Transportation', '43', '40', 'Agriculture', 'Consulting', 'Sports-Recreation', '44', 'Maritime', 'Environment', '41'])

Use MultiLabelBinarizer function for multi label use case

In [26]:
from sklearn.preprocessing import MultiLabelBinarizer

In [27]:
multilabelBiz= MultiLabelBinarizer(classes=sorted(label_counts_str.keys()))

In [28]:
y_train = multilabelBiz.fit_transform(y_train)
y_test = multilabelBiz.transform(y_test)

  .format(sorted(unknown, key=str)))


# Classifier - LogisticRegression

In [29]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [30]:
classi = LogisticRegression(solver='liblinear')
classi = OneVsRestClassifier(classi)

In [31]:
#fit
classi.fit(x_train1, y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

Make prediction

In [40]:
predicted_labels = classi.predict(x_train1)

In [41]:
from sklearn.metrics import accuracy_score

In [42]:
accuracy_score(y_train,predicted_labels)

0.8999580536912751

Perform inverse transform and check labels 

In [43]:
pred_inversed = multilabelBiz.inverse_transform(predicted_labels)
y_test_inversed = multilabelBiz.inverse_transform(y_train)

In [44]:
for i in range(10):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_train[i],
        ','.join(y_test_inversed[i]),
        ','.join(pred_inversed[i])
    ))

Title:	55 goth urllink oh goth goth girl good chanc bi freaki pump vien still laugh myself urllink take goth test fuali com time call death rock sometim 80 guess
True labels:	Capricorn,female,indUnk
Predicted labels:	Capricorn,female,indUnk


Title:	bandwidth will not abl go onlin much download much next week titl may impli happen exceed bandwidth limit plea let explain littl someth sympatico isp servic give allow upload download 10 gb roughli everymonth sinc unlimit internet not go cut internet keep download instead charg lot everi gigabyt download upload limit ah well fine download much csi month heh onlin lot resist comput next week next month start februari 26th not long seven day away ugh hope sister will not instant messag much msn know everi charact type upload anoth byte download least twice feel oblig type happen monday bad rememb today ugh decid start work tri catch up tire yet happi face shrug guess leav guy know number someth import come up arv
True labels:	Leo,Student,male

In [45]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def print_evaluation_scores(y_val, predicted):
    print('Accuracy score: ', accuracy_score(y_val, predicted))
    print('F1 score: ', f1_score(y_val, predicted, average='weighted'))
    print('Average precision score: ', average_precision_score(y_val, predicted, average='weighted'))
    print('Average recall score: ', recall_score(y_val, predicted, average='weighted'))

In [48]:
print('Classifier - LogisticRegression')
print_evaluation_scores(y_train, predicted_labels)

Classifier - LogisticRegression
Accuracy score:  0.8999580536912751
F1 score:  0.9688563101293611


  average, "true nor predicted", 'F-score is', len(true_sum)
  recall = tps / tps[-1]


Average precision score:  0.9486254352995382
Average recall score:  0.9407158836689038


  _warn_prf(average, modifier, msg_start, len(result))


Lets check test performance

In [49]:
predicted_labels_test = classi.predict(x_test1)

In [50]:
accuracy_score(y_test,predicted_labels_test)

0.0019569471624266144

In [51]:
pred_inversed = multilabelBiz.inverse_transform(predicted_labels_test)
y_test_inversed = multilabelBiz.inverse_transform(y_test)

In [52]:
for i in range(10):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_test[i],
        ','.join(y_test_inversed[i]),
        ','.join(pred_inversed[i])
    ))

Title:	hell offic sinc monday 1stli manpow like 4 person 10 pple 2ndli stupid ghost nick give mr woo manag co alway sneak behind pple back keep ignor us popout email state statist low blah blah blah open eye big see mani us handl wonder head fill grass no brain today thing happen thou patrick ryan around thing get alittl control yet toopid mr liew sup walk around eye close practic shout answer call answer call shit him see tat busi wat hell
True labels:	Engineering,Scorpio,male
Predicted labels:	female


Title:	today town wait anoth coupl hour cam ein late seri writeup town citi visit past fifteen year read one blog archiv
True labels:	Aries,Communications-Media,male
Predicted labels:	male


Title:	make amend earlier post trade tyler anyth make aw day million time well love
True labels:	Libra,Student,female
Predicted labels:	


Title:	hey i be bore no one talk think tell whole stori steubenvil weekend o k
True labels:	Capricorn,indUnk,male
Predicted labels:	


Title:	age sinc last post

# Classifier - SVM

In [83]:
from sklearn.svm import SVC

In [84]:
model = SVC()
model = OneVsRestClassifier(model)

In [85]:
model.fit(x_train1,y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


OneVsRestClassifier(estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=0.0,
                                  decision_function_shape='ovr', degree=3,
                                  gamma='scale', kernel='rbf', max_iter=-1,
                                  probability=False, random_state=None,
                                  shrinking=True, tol=0.001, verbose=False),
                    n_jobs=None)

In [89]:
predicted_labels = model.predict(x_test1)

In [90]:
accuracy_score(y_test,predicted_labels)

0.0

# Build Model - Using TF-IDF COuntVectoriser

In [16]:
from sklearn.model_selection import train_test_split
#Split data into 70:30 :: train : test 
X_train, X_test, y_train, y_test = train_test_split(finaldf1.ProcessedText.values,
                                                    finaldf1.labels.values,
                                                    test_size=0.3,
                                                    random_state=26
                                                    )

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
vecttfidf = TfidfVectorizer() 

In [20]:
vecttfidf.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [21]:
#Check the vocablury size
len(vecttfidf.vocabulary_)

44662

In [22]:
x_train2 = vecttfidf.transform(X_train)
x_test2 = vecttfidf.transform(X_test)

In [23]:
label_counts = dict()

for labels in finaldf1.labels.values:
    for label in labels:
        if label in label_counts:
            label_counts[label] += 1
        else:
            label_counts[label] = 1

In [24]:
keys_values = label_counts.items()
label_counts_str = {str(key): str(value) for key, value in keys_values}

In [25]:
from sklearn.preprocessing import MultiLabelBinarizer
multilabelBiz= MultiLabelBinarizer(classes=sorted(label_counts_str.keys()))

In [26]:
y_train = multilabelBiz.fit_transform(y_train)
y_test = multilabelBiz.transform(y_test)

  .format(sorted(unknown, key=str)))


In [27]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [28]:
classi2 = LogisticRegression(solver='liblinear')
classi2 = OneVsRestClassifier(classi2)

In [29]:
classi2.fit(x_train2,y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [30]:
from sklearn.metrics import accuracy_score

In [31]:
predicted_labels = classi2.predict(x_train2)
accuracy_score(y_train,predicted_labels)

0.0

In [32]:
predicted_labels = classi2.predict(x_test2)
accuracy_score(y_test,predicted_labels)

0.0

In [42]:
pred_inversed = multilabelBiz.inverse_transform(predicted_labels)
y_test_inversed = multilabelBiz.inverse_transform(y_test)

In [44]:
for i in range(50):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_test[i],
        ','.join(y_test_inversed[i]),
        ','.join(pred_inversed[i])
    ))

Title:	             I was  urlLink spotted  last night.          
True labels:	Pisces,indUnk,male
Predicted labels:	


Title:	           Just finished talking to Jan on the phone 10mins ago and well...you know how that guy influences my mood. Nobody can make me sa happy (or as sad) as I am now except my one and only loooooooove (drool). After coming back from lunch, I still didn't get any sms from him and neither is he online! Then finally after several minutes I got an sms from him telling me that just as he was about to come online, the power went off (I hope it's not some kind of bad sign!). Well, I hate to admit that I kinda doubt him to I asked him if I can call him now, just to make sure if he really is at home and wants to talk to me (well, I didn't tell him exactly THAT! He kill me!). So, I called him up and we talked...and talked...and went on for almost an hour (my boss is gonna kill me when he finds out!). I told him I'm using my boss' account and he told me it's okey and th