In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.multiclass import OneVsRestClassifier

In [2]:
df = pd.read_csv("Multi Label Stackoverflow.csv")

In [7]:
df.shape

(11838, 2)

In [6]:
df.isnull().sum()

Text    0
Tags    0
dtype: int64

In [5]:
# Drop the 'Unnamed: 0' column (presumably a row index saved into the CSV — confirm)
# and any rows that contain missing values.
# df is rebound instead of mutated with inplace=True, and errors='ignore' is passed,
# so re-running this cell does not raise KeyError once the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore').dropna(axis=0)

In [55]:
df.head()

Unnamed: 0,Text,Tags
0,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
1,adding scripting functionality to net applicat...,"['c#', '.net']"
2,should i use nested classes in this case i am ...,['c++']
3,homegrown consumption of web services i have b...,['.net']
4,automatically update version number i would li...,['c#']


In [56]:
import ast

In [57]:
type(df['Tags'][0])
# Each Tags value should be a list rather than a str, so that the individual tags inside it can be accessed.

str

In [58]:

# Parse each Tags cell from its string form (e.g. "['sql', 'asp.net']") into a real Python list.
# Values that are no longer strings are passed through unchanged, so re-running this cell
# after the column has already been converted does not raise.
df['Tags'] = df['Tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [60]:
type(df['Tags'][0])


list

In [61]:
# Encode the tag lists as a binary indicator (multi-hot) matrix:
# one column per distinct tag, with 1 wherever a row carries that tag.
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df['Tags'])

In [62]:
y

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [64]:
multilabel.classes_

array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby', 'ruby-on-rails', 'sql'],
      dtype=object)

In [65]:
y.shape

(11838, 20)

In [66]:
multilabel.classes_.shape

(20,)

In [69]:
# Preview the binarized targets as a DataFrame with one 0/1 column per tag class.
pd.DataFrame(y,columns=multilabel.classes_).head()

Unnamed: 0,.net,android,asp.net,c,c#,c++,css,html,ios,iphone,java,javascript,jquery,mysql,objective-c,php,python,ruby,ruby-on-rails,sql
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [71]:
# Now build the feature matrix X by applying TF-IDF to the text column.
# No additional text preprocessing is applied in this notebook.

In [72]:
# Word-level TF-IDF over unigrams to trigrams, with English stop words removed
# and the vocabulary capped at 1000 features.
tfidf = TfidfVectorizer(analyzer='word',max_features=1000, ngram_range=(1,3), stop_words='english')

In [73]:
# Learn the vocabulary and IDF weights from the Text column and build the sparse feature matrix X.
# NOTE(review): the vectorizer is fitted on the full dataset, before the train/test split further down,
# so test documents influence the vocabulary and IDF weights. Consider fitting on the training rows only.
X = tfidf.fit_transform(df['Text'])

In [74]:
tfidf.vocabulary_

{'aspnet': 67,
 'site': 795,
 'got': 382,
 'experience': 314,
 'creating': 210,
 'default': 231,
 'xml': 996,
 'file': 337,
 'working': 985,
 'properly': 672,
 'menu': 551,
 'controls': 195,
 'need': 574,
 'way': 965,
 'users': 933,
 'create': 207,
 'modify': 560,
 'pages': 616,
 'page': 615,
 'standard': 821,
 'adding': 25,
 'functionality': 364,
 'net': 578,
 'applications': 50,
 'little': 511,
 'game': 366,
 'written': 992,
 'uses': 934,
 'database': 217,
 'wanted': 963,
 'implement': 422,
 'function': 363,
 'mean': 547,
 'interface': 452,
 'class': 146,
 'implements': 426,
 'public': 678,
 'contains': 189,
 'called': 118,
 'make': 534,
 'thing': 877,
 'like': 499,
 'source': 806,
 'code': 153,
 'compile': 168,
 'use': 928,
 'just': 472,
 'add': 23,
 'tell': 868,
 'application': 49,
 'assembly': 69,
 'means': 548,
 'possible': 648,
 'new': 580,
 'language': 484,
 'extra': 321,
 'write': 990,
 'script': 754,
 'public class': 679,
 'source code': 807,
 'nested': 577,
 'classes': 147,


In [77]:
X.shape

(11838, 1000)

In [81]:
# Number of features learned by the vectorizer; max_features=1000 caps the vocabulary size.
# len(tfidf.vocabulary_) is used instead of len(tfidf.get_feature_names()) because
# get_feature_names() was removed in scikit-learn 1.2, while vocabulary_ exists in every version.
len(tfidf.vocabulary_)

1000

In [88]:
# Split features and targets into training and test sets: 33% of the rows are held out
# for testing, and random_state=42 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [87]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7931, 1000), (3907, 1000), (7931, 20), (3907, 20))

In [89]:
# Base binary classifier with library-default hyperparameters; it is wrapped in
# OneVsRestClassifier further down so that one model is trained per tag.
lr = LogisticRegression()

In [90]:
import numpy as np


In [92]:
# Quick check: np.minimum works elementwise, so for two 0/1 vectors it marks the
# positions set in both of them (their intersection), which the Jaccard score relies on.
np.minimum([0,0,1],[0,0,1])

array([0, 0, 1])

In [107]:
def jaccard(y_true, y_pred, zero_division=0.0):
    """Return the mean per-sample Jaccard score, as a percentage.

    For every row the score is |intersection| / |union| of the labels set in
    ``y_true`` and ``y_pred``; the row scores are averaged and multiplied by 100.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_labels)
        Binary (0/1) indicator matrices with identical shapes.
    zero_division : float, default=0.0
        Score assigned to a row where neither ``y_true`` nor ``y_pred`` has any
        label set (empty union). Without this guard such a row evaluates 0/0,
        which yields NaN and turns the overall mean into NaN.

    Returns
    -------
    float
        Mean Jaccard score over all rows, in the range 0 to 100.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # For 0/1 indicators, elementwise min marks labels present in both rows
    # and elementwise max marks labels present in either row.
    intersection = np.minimum(y_true, y_pred).sum(axis=1)
    union = np.maximum(y_true, y_pred).sum(axis=1)

    # Pre-fill with the fallback value and divide only where the union is non-empty.
    scores = np.full(union.shape, float(zero_division))
    np.divide(intersection, union, out=scores, where=union != 0)
    return scores.mean() * 100

In [96]:
# One-vs-rest strategy: wrap the logistic-regression estimator so that a separate
# binary classifier is fitted for each tag column of y_train.
clf = OneVsRestClassifier(lr)
clf.fit(X_train,y_train)



OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None)

In [97]:
y_pred = clf.predict(X_test)

In [108]:
jaccard(y_test,y_pred)

37.98950601484515

### The multilabel Jaccard score comes out to around 38%

In [115]:
# now trying with SVM model

In [116]:
from sklearn.svm import LinearSVC

In [117]:
# Second model: a linear SVM with an L1 penalty and C=1.5, solved in the primal (dual=False).
# NOTE(review): LinearSVC presumably requires dual=False when penalty='l1' — confirm against the scikit-learn docs.
SVC = LinearSVC( C= 1.5, penalty='l1', dual=False)
# clf is rebound here, so the cells that follow predict with the SVM rather than the logistic-regression model.
clf = OneVsRestClassifier(SVC)
clf.fit(X_train,y_train)

OneVsRestClassifier(estimator=LinearSVC(C=1.5, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None)

In [118]:
y_pred = clf.predict(X_test)

In [119]:
jaccard(y_test,y_pred)

53.73261667093252

### Testing our model on a new, unseen example

In [143]:
x = ['While writing code for natural language processing, tfidfvetorizer python library is not responding properly, can we do it in Java also']

In [144]:
xt=tfidf.transform(x)

In [145]:
xt.shape

(1, 1000)

In [146]:
y_pred=clf.predict(xt)

In [147]:
y_pred

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [148]:
multilabel.inverse_transform(y_pred)

[('java', 'python')]

# Nice ! Multilabel classification is done