## Project | Statistical NLP

1. Load the dataset

In [69]:
import pandas as pd
dataset = pd.read_csv('blogtext.csv')

In [70]:
dataset.shape

(681284, 7)

In [71]:
# df = dataset.iloc[:30000, :]
df = dataset.sample(5000)

In [72]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
590959,47519,male,23,Communications-Media,Sagittarius,"20,March,2004",Been having fun with Jen for her short ...
635609,3452960,female,17,Student,Cancer,"23,June,2004","So, hello. It's me...grooowl...."
232181,3426344,female,16,Student,Scorpio,"25,May,2004",I'm on IM all the time. My s/n is SVrun...
339423,1436696,female,23,Agriculture,Capricorn,"05,June,2004",I really need to get this out of my...
354948,1135724,male,34,Internet,Cancer,"25,May,2004",News from urlLink The Hindu So...


In [73]:
df.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

2. Preprocess rows of the “text” column

In [74]:
import re
def processString(myString):
    myStringSpace = myString.strip()
    myString = myStringSpace.lower() 
    otherString = re.sub("[^a-zA-Z]"," ",myString)
    return otherString
df["text"] = df["text"].map(processString)

In [75]:
df["text"].head(2)

590959    been having fun with jen for her short stay  a...
635609    so  hello  it s me   grooowl  so a run down of...
Name: text, dtype: object

3. As we want to make this into a multi-label classification problem, you are required to merge
all the label columns together, so that we have all the labels together for a particular sentence

In [76]:
import numpy as np
df['Labels']=np.array(df.iloc[:,1:5].astype(str)).tolist()

In [77]:
df = df.iloc[:, 6:]

In [78]:
df.head()

Unnamed: 0,text,Labels
590959,been having fun with jen for her short stay a...,"[male, 23, Communications-Media, Sagittarius]"
635609,so hello it s me grooowl so a run down of...,"[female, 17, Student, Cancer]"
232181,i m on im all the time my s n is svrunner b...,"[female, 16, Student, Scorpio]"
339423,i really need to get this out of my head r...,"[female, 23, Agriculture, Capricorn]"
354948,news from urllink the hindu sorry for the c...,"[male, 34, Internet, Cancer]"


4. Separate features and labels, and split the data into training and testing

In [79]:
X = df["text"]
y = df["Labels"]

In [134]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=60)

5. Vectorize the features

In [138]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1,2)
#                              ,analyzer='word',min_df=2,max_df=10
                             , max_features = 50000
                            ) 

train_data_features = vectorizer.fit_transform(X_train)
test_data_features = vectorizer.transform(X_test)

train_data_features = train_data_features.toarray()
test_data_features = test_data_features.toarray()

In [139]:
print ("train_data_features.shape:",train_data_features.shape)
print ("test_data_features.shape:",test_data_features.shape)

train_data_features.shape: (4000, 50000)
test_data_features.shape: (1000, 50000)


In [140]:
from sklearn import preprocessing
trainX = preprocessing.StandardScaler().fit_transform(train_data_features)
testX = preprocessing.StandardScaler().fit_transform(test_data_features)

6. Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label.

In [141]:
pd.Series([j for i in y.tolist() for j in i]).value_counts().to_dict()

{'male': 2554,
 'female': 2446,
 'indUnk': 1814,
 'Student': 1155,
 '17': 622,
 '24': 569,
 '23': 539,
 '16': 523,
 '25': 488,
 'Cancer': 485,
 'Aries': 482,
 'Taurus': 478,
 'Virgo': 450,
 'Libra': 440,
 '26': 422,
 'Scorpio': 413,
 'Leo': 396,
 'Pisces': 379,
 'Aquarius': 376,
 'Gemini': 375,
 'Sagittarius': 374,
 'Capricorn': 352,
 '27': 334,
 'Technology': 306,
 '15': 291,
 'Arts': 251,
 'Education': 241,
 '14': 199,
 '34': 164,
 'Communications-Media': 134,
 'Internet': 133,
 'Non-Profit': 121,
 '33': 120,
 '36': 112,
 '35': 107,
 '13': 100,
 'Engineering': 91,
 '37': 64,
 '38': 60,
 'Law': 58,
 '40': 44,
 'Consulting': 43,
 'Fashion': 42,
 'Publishing': 41,
 'BusinessServices': 40,
 '39': 39,
 'Religion': 39,
 'Government': 39,
 '41': 35,
 'Advertising': 33,
 'Marketing': 33,
 '43': 32,
 'Accounting': 30,
 'Science': 29,
 'Chemicals': 29,
 '45': 29,
 'HumanResources': 29,
 '46': 28,
 'Telecommunications': 28,
 'Banking': 26,
 'Sports-Recreation': 25,
 'RealEstate': 24,
 'Museums-

7. Transform the labels

In [142]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_train_mlb = mlb.fit_transform(y_train)
y_test_mlb = mlb.fit_transform(y_test)

In [143]:
print ("train_data_lables.shape:",y_train_mlb.shape)
print ("test_data_lables.shape:",y_test_mlb.shape)

train_data_lables.shape: (4000, 80)
test_data_lables.shape: (1000, 80)


In [144]:
y_train.head()

63972     [female, 14, indUnk, Sagittarius]
198440          [female, 27, indUnk, Aries]
493047          [male, 23, Student, Taurus]
450332          [female, 17, indUnk, Libra]
73562          [male, 26, Education, Libra]
Name: Labels, dtype: object

In [145]:
y_train_mlb[0]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0])

8. Choose a classifier

In [148]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [149]:
clf = LogisticRegression(solver = 'lbfgs', max_iter = 2000 )
clf= OneVsRestClassifier(clf) 

In [150]:
clf.fit(trainX, y_train_mlb)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=2000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

9. Fit the classifier, make predictions and get the accuracy

In [151]:
from sklearn import metrics
y_predict_test = clf.predict(testX)
y_predict_train = clf.predict(trainX)

In [153]:
print("Train accuracy score:", metrics.accuracy_score(y_train_mlb,y_predict_train))

Train accuracy score: 0.98525


In [154]:
print ("Classification report:", "Train Accuracy Score: ", round(metrics.accuracy_score(y_train_mlb,y_predict_train),4))
print ("-"*56)
print(metrics.classification_report(y_train_mlb,y_predict_train))
print ("-"*56)
print("")
print ("Classification report:", "Test Accuracy Score: ", round(metrics.accuracy_score(y_test_mlb,y_predict_test),4))
print ("-"*56)
print(metrics.classification_report(y_test_mlb,y_predict_test))
print ("-"*56)
print("")

Classification report: Train Accuracy Score:  0.9852
--------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        75
           1       1.00      0.99      0.99       166
           2       1.00      0.98      0.99       229
           3       1.00      0.99      1.00       423
           4       1.00      0.99      1.00       505
           5       1.00      0.98      0.99       452
           6       1.00      0.99      1.00       433
           7       1.00      0.99      0.99       391
           8       1.00      0.98      0.99       326
           9       1.00      0.99      1.00       275
          10       1.00      0.97      0.98        93
          11       1.00      0.99      0.99       139
          12       1.00      0.97      0.99        80
          13       1.00      0.98      0.99        91
          14       1.00      1.00      1.00        50
          15       1.00  

  _warn_prf(average, modifier, msg_start, len(result))


10. Print true label and predicted label for any five examples

In [155]:
print("Train data: predicted label for any five examples ") 
print("-"*50)
d = []
five_examples = [100, 200, 300, 400, 500]
for i,n in zip(five_examples , range(1,6)):
    d.append((n," | ", list(mlb.inverse_transform(y_train_mlb)[i])," | ", list(mlb.inverse_transform(y_predict_train)[i])))
pd.DataFrame(d, columns=('_example_',"", '_true labels_', "",'_predicted labels_'))

Train data: predicted label for any five examples 
--------------------------------------------------


Unnamed: 0,_example_,Unnamed: 2,_true labels_,Unnamed: 4,_predicted labels_
0,1,|,"[17, Gemini, Student, female]",|,"[17, Gemini, Student, female]"
1,2,|,"[25, Leo, indUnk, male]",|,"[25, Leo, indUnk, male]"
2,3,|,"[17, Aries, indUnk, male]",|,"[17, Aries, indUnk, male]"
3,4,|,"[16, Aries, Religion, male]",|,"[16, Aries, Religion, male]"
4,5,|,"[27, Sagittarius, Technology, male]",|,"[27, Sagittarius, Technology, male]"


In [156]:
print("Test data: predicted label for any five examples ") 
print("-"*50)
d = []
five_examples = [100, 200, 300, 400, 500]
for i,n in zip(five_examples , range(1,6)):
    d.append((n," | ", list(mlb.inverse_transform(y_test_mlb)[i])," | ", list(mlb.inverse_transform(y_predict_test)[i])))
pd.DataFrame(d, columns=('_example_',"", '_true labels_', "",'_predicted labels_'))

Test data: predicted label for any five examples 
--------------------------------------------------


Unnamed: 0,_example_,Unnamed: 2,_true labels_,Unnamed: 4,_predicted labels_
0,1,|,"[35, Taurus, female, indUnk]",|,"[female, indUnk]"
1,2,|,"[34, Internet, Leo, female]",|,"[female, indUnk]"
2,3,|,"[34, Capricorn, female, indUnk]",|,[female]
3,4,|,"[24, Education, Virgo, male]",|,[male]
4,5,|,"[38, Aries, Non-Profit, female]",|,"[indUnk, male]"
