## Tag prediction for stackOverflow questions

In [1]:
import pandas as pd
import numpy as np
import nltk
from bs4 import BeautifulSoup
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Ambika
[nltk_data]     Sadhu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Ambika
[nltk_data]     Sadhu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### EDA

In [64]:
df = pd.read_csv('Questions.csv',encoding = 'ISO-8859-1')

In [6]:
df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [65]:
tags = pd.read_csv('Tags.csv',encoding = 'ISO-8859-1')

In [8]:
tags["Id"]

0                80
1                80
2                80
3                90
4                90
             ...   
3750989    40143360
3750990    40143360
3750991    40143380
3750992    40143380
3750993    40143380
Name: Id, Length: 3750994, dtype: int64

#### Modify the dataset so that same tags are grouped together

In [66]:
tags['Tag'] = tags['Tag'].astype(str)
grouped =tags.groupby("Id")['Tag'].apply(lambda x: ' '.join(x))

In [10]:
grouped.head()

Id
80                            flex actionscript-3 air
90       svn tortoisesvn branch branching-and-merging
120                               sql asp.net sitemap
180    algorithm language-agnostic colors color-space
260           c# .net scripting compiler-construction
Name: Tag, dtype: object

In [67]:
df_new = df.merge(grouped, on = 'Id')

In [None]:
df_new.columns

In [68]:
df_new.drop(['OwnerUserId','CreationDate','ClosedDate'],axis = 1,inplace = True)
df_final = df_new[df_new['Score']>=5] #ignoring questions with very low scores
df_final.drop(['Id','Score'],axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [69]:
df_final.isnull().sum() #no null values in the dataset

Title    0
Body     0
Tag      0
dtype: int64

In [71]:
df_final

Unnamed: 0,Title,Body,Tag
0,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex actionscript-3 air
1,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn tortoisesvn branch branching-and-merging
2,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql asp.net sitemap
3,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,algorithm language-agnostic colors color-space
4,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c# .net scripting compiler-construction
...,...,...,...
1263253,CMFCMenuButton not properly repainting when to...,"<p>In an C++ MFC project I'm using <a href=""ht...",c++ mfc accessibility high-contrast cmfcmenubu...
1263399,How can I force file ordering in F# projects u...,<p>I'm trying to work with vscode in my WebSha...,f# vscode
1263454,Why does my result data returned as void* gets...,<p>I am working in a project with a huge legac...,c++
1263609,Do I need to extend FirebaseInstanceIdService ...,<p>I want to manage topic subscription from th...,android firebase firebase-cloud-messaging


In [15]:
df_final["Body"][0]

'<p>I\'ve written a database generation script in <a href="http://en.wikipedia.org/wiki/SQL">SQL</a> and want to execute it in my <a href="http://en.wikipedia.org/wiki/Adobe_Integrated_Runtime">Adobe AIR</a> application:</p>\n\n<pre><code>Create Table tRole (\n      roleID integer Primary Key\n      ,roleName varchar(40)\n);\nCreate Table tFile (\n    fileID integer Primary Key\n    ,fileName varchar(50)\n    ,fileDescription varchar(500)\n    ,thumbnailID integer\n    ,fileFormatID integer\n    ,categoryID integer\n    ,isFavorite boolean\n    ,dateAdded date\n    ,globalAccessCount integer\n    ,lastAccessTime date\n    ,downloadComplete boolean\n    ,isNew boolean\n    ,isSpotlight boolean\n    ,duration varchar(30)\n);\nCreate Table tCategory (\n    categoryID integer Primary Key\n    ,categoryName varchar(50)\n    ,parent_categoryID integer\n);\n...\n</code></pre>\n\n<p>I execute this in Adobe AIR using the following methods:</p>\n\n<pre><code>public static function RunSqlFromFile

### Text preprocessing 

#### Convert the body into text - BeautifulSoup 

In [72]:
df_final['Body'] = df_final['Body'].apply(lambda x: BeautifulSoup(x).get_text()) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['Body'] = df_final['Body'].apply(lambda x: BeautifulSoup(x).get_text())


In [73]:
df_final['Body'][1]

"Are there any really good tutorials explaining branching and merging with Apache Subversion? \nAll the better if it's specific to TortoiseSVN client.\n"

In [74]:
stop_words = stopwords.words('english')
def clean(text):
    text = text.lower()
    text = re.sub('\s+',' ',text)
    text = text.strip()
    #stopwords, lemmatization
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    new_text = tokenizer.tokenize(text)
    ##print(new_text)
    filtered_text = [word for word in new_text if word not in stop_words]
    #print(filtered_text)
    lemmatizer = WordNetLemmatizer()
    txt = map(lambda x: lemmatizer.lemmatize(x), filtered_text)
    #print(list(txt))
    return list(txt)

In [75]:
df_final["Body"] = df_final["Body"].apply(lambda x: clean(x))
df_final["Title"] = df_final["Title"].apply(lambda x: clean(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["Body"] = df_final["Body"].apply(lambda x: clean(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["Title"] = df_final["Title"].apply(lambda x: clean(x))


In [20]:
df_final["Body"][0]

['written',
 'database',
 'generation',
 'script',
 'sql',
 'want',
 'execute',
 'adobe',
 'air',
 'application',
 'create',
 'table',
 'trole',
 'roleid',
 'integer',
 'primary',
 'key',
 'rolename',
 'varchar',
 '40',
 'create',
 'table',
 'tfile',
 'fileid',
 'integer',
 'primary',
 'key',
 'filename',
 'varchar',
 '50',
 'filedescription',
 'varchar',
 '500',
 'thumbnailid',
 'integer',
 'fileformatid',
 'integer',
 'categoryid',
 'integer',
 'isfavorite',
 'boolean',
 'dateadded',
 'date',
 'globalaccesscount',
 'integer',
 'lastaccesstime',
 'date',
 'downloadcomplete',
 'boolean',
 'isnew',
 'boolean',
 'isspotlight',
 'boolean',
 'duration',
 'varchar',
 '30',
 'create',
 'table',
 'tcategory',
 'categoryid',
 'integer',
 'primary',
 'key',
 'categoryname',
 'varchar',
 '50',
 'parent_categoryid',
 'integer',
 'execute',
 'adobe',
 'air',
 'using',
 'following',
 'method',
 'public',
 'static',
 'function',
 'runsqlfromfile',
 'filename',
 'string',
 'void',
 'var',
 'file',
 '

In [76]:
df_final["Tag"] = df_final["Tag"].apply(lambda x: x.split())
tag_list = [tag for taglist in df_final["Tag"].values for tag in taglist]
freqs = nltk.FreqDist(tag_list)
most_common = freqs.most_common(100)
most_common = [w[0] for w in most_common]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["Tag"] = df_final["Tag"].apply(lambda x: x.split())


In [77]:
len(set(tag_list))

16632

### Keep only the tags that lie in top 100

In [15]:
def keep_common(tags):
    filtered = []
    for tag in tags:
        if tag in most_common:
            filtered.append(tag)
    return filtered

In [78]:
df_final["Tag"] = df_final["Tag"].apply(lambda x: keep_common(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["Tag"] = df_final["Tag"].apply(lambda x: keep_common(x))


In [79]:
df_final["Tag"] = df_final["Tag"].apply(lambda x: x if len(x)>0 else None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["Tag"] = df_final["Tag"].apply(lambda x: x if len(x)>0 else None)


In [80]:
df_final.dropna(axis = 0,inplace = True,subset = ["Tag"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.dropna(axis = 0,inplace = True,subset = ["Tag"])


In [81]:
df_final.shape

(80393, 3)

### Converting text into numbers : TFIDF(Approach 1)

In [82]:
body = [' '.join(x) for x in df_final["Body"].values.tolist()]
title = [' '.join(x) for x in df_final["Title"].values.tolist()]

In [83]:
body

['really good tutorial explaining branching merging apache subversion better specific tortoisesvn client',
 'anyone got experience creating sql based asp net site map provider got default xml file web sitemap working properly menu sitemappath control need way user site create modify page dynamically need tie page viewing permission standard asp net membership system well',
 'something pseudo solved many time never quite found solution stuck problem come way generate n color distinguishable possible n parameter',
 'little game written c us database back end trading card game wanted implement function card script mean essentially interface icard card class implement public class card056 icard contains function called game make thing maintainable moddable would like class card source code database essentially compile first use add change card add database tell application refresh without needing assembly deployment especially since would talking 1 assembly per card mean hundred assembly p

In [129]:
vectorizer_1 = TfidfVectorizer(max_features = 1000)
X1= vectorizer_1.fit_transform(body)

In [131]:
X1 = list(map(lambda x: x.flatten(),X1.toarray()))

In [89]:
vectorizer_2 = TfidfVectorizer(max_features = 1000)
X2= vectorizer_2.fit_transform(title)

In [136]:
X2 = list(map(lambda x: x.flatten(),X2.toarray()))

In [137]:
X= np.hstack((np.array(X1),np.array(X2)))

### MultiLabel Binarizer

Since we have multiple labels in the target.

In [142]:
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df_final["Tag"])

In [143]:
y.shape

(80393, 100)

### Train Test Split

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Model

In [148]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [178]:
sgd = SGDClassifier()
lr = LogisticRegression()
svc = LinearSVC()

In [177]:
def j_score(y_true, y_pred):
    jaccard = np.minimum(y_true, y_pred).sum(axis = 1)/np.maximum(y_true, y_pred).sum(axis = 1)
    return jaccard.mean()*100


def print_score(y_pred, clf):
    print("Classifier: ", clf.__class__.__name__)
    print('Jacard score: {}'.format(j_score(y_test, y_pred)))
    print('----')

In [179]:
for classifier in [sgd, lr, svc]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print_score(y_pred, classifier)

Classifier:  SGDClassifier
Jacard score: 47.481704915313976
----
Classifier:  LogisticRegression
Jacard score: 49.28923855131953
----
Classifier:  LinearSVC
Jacard score: 52.19655036590169
----
