In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the tags table (Tags.csv) from the StackSample input directory.
tags = pd.read_csv(filepath_or_buffer='../input/stacksample/Tags.csv')

In [3]:
# Preview the first rows of the freshly loaded tags table.
tags.head()

In [4]:
# Number of occurrences of every distinct tag (most frequent first).
temp = tags.loc[:, 'Tag'].value_counts()

In [None]:
#pd.set_option("display.max_rows", None)

In [None]:
# printing the counts - 
#temp

In [5]:
# Turn the per-tag counts into a table with a fresh integer index,
# moving the tag labels out of the index into a regular column.
temp = temp.to_frame().reset_index()
temp.head()

In [6]:
# Name the two columns by position instead of renaming by label.
# `value_counts()` followed by `reset_index()` yields the labels ('index', 'Tag')
# on pandas < 2.0 but ('Tag', 'count') on pandas >= 2.0, so a label-based rename
# silently produces the wrong column names on newer versions.
# Column 0 holds the tag text, column 1 its number of occurrences.
temp.columns = ['Tag', 'Counts']
temp.head()

In [7]:
# Attach each tag's overall count to every (Id, Tag) row.
tags = tags.merge(temp, on='Tag')

In [8]:
# Preview the tags table after the counts have been merged in.
tags.head()

## **Choosing the 20 least frequent tags among those with count >= 4021 - for a decent data size and less imbalance between the classes (i.e., tags)**

In [9]:
# Row labels (in `temp`) of the tags that occur at least 4021 times.
temp.loc[temp["Counts"] >= 4021, :].index

In [10]:
# Pick the tags to model: among the tags whose count meets the threshold, take
# the NUM_CHOSEN_TAGS ones closest to it.
# `temp` comes from value_counts(), i.e. it is ordered by descending count, so
# walking the eligible row labels backwards starts at the least frequent
# eligible tag. Deriving the labels from the data replaces the previously
# hand-copied upper row label (106).
MIN_TAG_COUNT = 4021
NUM_CHOSEN_TAGS = 20
eligible_tag_rows = temp.index[temp["Counts"] >= MIN_TAG_COUNT]
chosen_tags_indices = eligible_tag_rows[::-1][:NUM_CHOSEN_TAGS].tolist()
chosen_tags_names = list(temp.loc[chosen_tags_indices, "Tag"].values)

In [11]:
#tags.where(tags['Counts']>=20345 , inplace = True)
# Keep only the rows whose tag is one of the chosen tags.
is_chosen_tag = tags["Tag"].isin(chosen_tags_names)
tags = tags[is_chosen_tag]

In [12]:
# Preview the tags table after restricting it to the chosen tags.
tags.head()

In [None]:
#tags.dropna(inplace = True)
#tags.head()

In [13]:
# Order the rows by question Id (ascending).
tags = tags.sort_values('Id', ascending=True)

In [14]:
# Preview the tags table after sorting by Id.
tags.head()

In [15]:
# Store the question Id as a 32-bit integer; all other columns keep their dtype.
tags = tags.astype({'Id': 'int32'})

In [16]:
# (rows, columns) of the filtered tags table.
tags.shape

### **Tags file ready**

## **Importing the Questions file -**

In [17]:
# Load the questions table; the file is read with the latin-1 codec.
q = pd.read_csv(
    filepath_or_buffer='../input/stacksample/Questions.csv',
    encoding='latin-1',
)

In [18]:
# Preview the first rows of the raw questions table.
q.head()

In [19]:
# Working copy of the questions without the metadata columns that are not used below.
unused_question_columns = ['OwnerUserId', 'CreationDate', 'ClosedDate', 'Score']
df = q.drop(columns=unused_question_columns)

In [20]:
def comb_tags_func(x):
    """Join the tag strings of one group into a single space-separated string.

    ``x`` is a pandas Series of tag strings (the notebook passes each
    ``groupby('Id')['Tag']`` group); the values are joined in their
    existing order.
    """
    return ' '.join(x.values)

In [21]:
# Question Id -> space-separated string of that question's chosen tags.
tags_per_question = tags.groupby('Id')['Tag']
mapping_dict = tags_per_question.apply(comb_tags_func)

In [None]:
#tags.groupby('Id')['Tag'].count().sort_values(ascending = False)

In [None]:
#mapping_dict

In [22]:
# Look up each question's tag string by Id; questions without a chosen tag get NaN.
question_tag_strings = df['Id'].map(mapping_dict)
df['tag'] = question_tag_strings

In [23]:
# Preview the questions with the new 'tag' column.
df.head()

In [24]:
# Drop every row that has a missing value in any column (this includes the
# questions that received no tag string from the Id lookup).
df.dropna(axis=0, how='any', inplace=True)

In [25]:
# Turn the space-separated tag string back into a list of tags per question.
df['tag'] = df['tag'].str.split()

In [26]:
# Preview the questions after the tag strings were split into lists.
df.head()

In [27]:
# Inspect one question body (second row) before any text cleaning is applied.
df['Body'].iloc[1]

## **Text cleaning and preprocessing -**

In [28]:
# Replace opening paragraph tags with a space.
df['Body'] = df['Body'].str.replace(pat='<p>', repl=' ')

In [29]:
import re

# Replace URL-like tokens with a space.
# The pattern is written as a raw string: in a normal string literal the
# sequences \/ \d \. \w are invalid escapes (a SyntaxWarning on Python 3.12+).
# The regular expression itself is unchanged.
# NOTE(review): the scheme group is optional and only lowercase letters are
# matched, so the pattern also hits any lowercase dotted token (e.g. "foo.bar"),
# while upper-case hosts are left alone because lower-casing happens later.
url_pattern = re.compile(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)')
df['Body'] = df['Body'].apply(lambda x: url_pattern.sub(' ', x))

In [30]:
# Replace closing paragraph tags, newlines and closing anchor tags with a space,
# one fragment after the other.
for fragment in ('</p>', '\n', '</a>'):
    df['Body'] = df['Body'].str.replace(fragment, ' ')

In [31]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Lower-case every body, then drop the whitespace-separated tokens that are stop words.
df['Body'] = df['Body'].str.lower()
df['Body'] = df['Body'].apply(
    lambda body: ' '.join(token for token in body.split() if token not in stop_words)
)

In [32]:
# Remove the empty anchor openers left behind once the URLs were blanked out.
df['Body'] = df['Body'].str.replace(pat='<a href=" ">', repl='')

In [33]:
# Keep only the tokens that are longer than three characters.
df['Body'] = df['Body'].map(
    lambda body: ' '.join(token for token in body.split() if len(token) > 3)
)

In [34]:
# Inspect the same question body (second row) after the cleaning steps.
df['Body'].iloc[1]

## **Model building begins -**

In [35]:
# Preview the cleaned questions before building the label matrix.
df.head()

In [36]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# One 0/1 indicator column per tag, named 'tag_<tag>', aligned on df's index
# and appended to the right of the existing columns.
tag_matrix = mlb.fit_transform(df['tag'])
tag_dummies = pd.DataFrame(tag_matrix, index=df.index, columns=mlb.classes_).add_prefix('tag_')
df = pd.concat([df, tag_dummies], axis=1)

In [37]:
# Preview the frame with the tag indicator columns appended.
df.head()

In [38]:
# Names of the indicator columns: every column whose name contains 'tag_'.
pred_features = list(filter(lambda column: 'tag_' in column, df.columns))
pred_features

In [39]:
# Label matrix: one indicator column per tag.
y = df.loc[:, pred_features]

In [40]:
# (number of questions, number of tag columns) of the label matrix.
y.shape

In [41]:
# Model input: cleaned body followed by the title.
df['text'] = df['Body'] + ' ' + df['Title']

# Number of chosen tags per question. The indicator columns are selected via
# `pred_features` instead of a hard-coded first/last column label, so the sum
# always covers exactly the columns produced by the binarizer.
df['num_tags'] = df[pred_features].sum(axis=1)
df.head()
#X = df['text']

In [42]:
# How many questions carry 1, 2, ... of the chosen tags.
df['num_tags'].value_counts()

### **Stratifying the folds so that each one keeps the same distribution of the number of tags per question -**

In [57]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.model_selection import train_test_split

In [44]:
# Add a placeholder fold column (-1 = unassigned), shuffle the rows with a
# fixed seed and renumber the index from 0.
df = (
    df.assign(kfold=-1)
      .sample(frac=1, random_state=100)
      .reset_index(drop=True)
)

In [45]:
skf = model_selection.StratifiedKFold(n_splits = 5)

# Write the fold number into 'kfold' for the validation rows of every split;
# the splits are stratified on the number of tags per question.
fold_splits = skf.split(X = df, y = df.num_tags.values)
for f, (train_positions, val_positions) in enumerate(fold_splits):
    df.loc[val_positions, 'kfold'] = f

In [46]:
# Number of rows assigned to each fold.
df.kfold.value_counts()

In [47]:
# L1-regularised logistic regression with balanced class weights.
lr = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
    C=0.1,
    random_state=25,
)

In [48]:
# TF-IDF features limited to 2500 terms, with English stop words removed.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=2500,
)

In [49]:
# Scaler instance; in the cells visible here it is only referenced from
# commented-out lines of the cross-validation loop.
std_obj = StandardScaler()

In [69]:
#sgd_model = SGDClassifier(loss='hinge', alpha=0.00001, penalty='elasticnet', l1_ratio = 0.70, max_iter = 1000, tol = 1e-3, learning_rate = 'adaptive', 
                          #early_stopping = True, n_iter_no_change = 3, class_weight = None, eta0 = 0.1)
#multilabel_model = MultiOutputClassifier(sgd_model, n_jobs=-1)

import sklearn

# The logistic loss of SGDClassifier is spelled 'log_loss' from scikit-learn 1.1
# on; the old spelling 'log' was deprecated there and later removed. Pick the
# spelling that matches the installed version so the cell runs on both.
_sklearn_major_minor = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
LOGISTIC_LOSS = 'log_loss' if _sklearn_major_minor >= (1, 1) else 'log'

# One independent L1-regularised logistic SGD classifier per tag column.
# random_state pins the SGD shuffling so repeated runs give the same scores.
multilabel_model = MultiOutputClassifier(
    SGDClassifier(loss=LOGISTIC_LOSS, alpha=0.00001, penalty='l1', random_state=25),
    n_jobs = -1,
)
#multilabel_model = BinaryRelevance(SGDClassifier(loss='log', alpha=0.00001, penalty='l1'))
#multilabel_model = ClassifierChain(SGDClassifier(loss='log', alpha=0.00001, penalty='l1'))
#multilabel_model = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1'), n_jobs=-1)
#multilabel_model = MultiOutputClassifier(DecisionTreeClassifier(max_depth = 11, class_weight = 'balanced'), n_jobs=-1)
#multilabel_model = MultiOutputClassifier(lr, n_jobs = -1)

In [70]:
# Fold-wise evaluation of `multilabel_model` on the folds stored in df['kfold'].
# For every fold the TF-IDF vectorizer is fitted on the training rows only and
# merely applied to the validation rows.
train_acc_scores = []
val_acc_scores = []
train_ham_loss = []
val_ham_loss = []

# Visit the folds in ascending fold-id order (value_counts() orders by fold size).
for fold_num, kfold_idx in enumerate(sorted(df['kfold'].unique()), start = 1):
    print(f"Fold num : {fold_num}", end = "\n")

    # Split the frame once per fold instead of re-filtering it for every array.
    val_mask = (df['kfold'] == kfold_idx).values
    train_rows = df.loc[~val_mask]
    val_rows = df.loc[val_mask]

    X_train = tfidf_vectorizer.fit_transform(train_rows['text'].values)
    X_val = tfidf_vectorizer.transform(val_rows['text'].values)

    # `pred_features` lists every tag indicator column, so the targets do not
    # depend on hard-coded first/last column labels.
    y_train = train_rows[pred_features].values
    y_val = val_rows[pred_features].values

    multilabel_model.fit(X_train , y_train)
    preds_train = multilabel_model.predict(X_train)
    preds_val = multilabel_model.predict(X_val)

    # For multilabel targets accuracy_score is the exact-match (subset) accuracy.
    train_acc_scores.append(accuracy_score(y_true = y_train, y_pred = preds_train))
    val_acc_scores.append(accuracy_score(y_true = y_val, y_pred = preds_val))
    train_ham_loss.append(hamming_loss(y_true = y_train, y_pred = preds_train))
    val_ham_loss.append(hamming_loss(y_true = y_val, y_pred = preds_val))

    print(f"Train Accuracy : {train_acc_scores[-1]}")
    print(f"Val Accuracy : {val_acc_scores[-1]}")

    print(f"Train Hamming Loss : {train_ham_loss[-1]}")
    print(f"Val Hamming Loss : {val_ham_loss[-1]}")

    print("\n")

print(f"Mean Train Accuracy : {np.mean(train_acc_scores)}")
print(f"Mean Val Accuracy : {np.mean(val_acc_scores)}")

print(f"Mean Train Hamming Loss : {np.mean(train_ham_loss)}")
print(f"Mean Val Hamming Loss : {np.mean(val_ham_loss)}")

### **OneVsRest and MultiOutput give almost the same results**

## **Training and evaluating the best model -**

In [53]:
# Hold out 20% of the questions for the final evaluation.
# The targets are selected through `pred_features` (all tag indicator columns)
# rather than a hard-coded first/last column label, which keeps the label
# matrix aligned with the binarizer's classes.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, 'text'],
    df.loc[:, pred_features],
    test_size = 0.2,
    random_state = 25,
)

In [54]:
# Transforming the X variables before modelling -
# The vectorizer is fitted on the training text only and then applied to the
# test text.
# NOTE(review): X_train / X_test are rebound from text to feature matrices, so
# this cell looks non-re-runnable without re-running the split first — confirm.
X_train = tfidf_vectorizer.fit_transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)

In [55]:
import sklearn

# The logistic loss of SGDClassifier is spelled 'log_loss' from scikit-learn 1.1
# on; the old spelling 'log' was deprecated there and later removed. Pick the
# spelling that matches the installed version so the cell runs on both.
_sklearn_major_minor = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
LOGISTIC_LOSS = 'log_loss' if _sklearn_major_minor >= (1, 1) else 'log'

# Final model: one L1-regularised logistic SGD classifier per tag column,
# seeded so that the fitted (and later pickled) model is reproducible.
multilabel_model = MultiOutputClassifier(
    SGDClassifier(loss=LOGISTIC_LOSS, alpha=0.00001, penalty='l1', random_state=25),
    n_jobs=-1,
)
multilabel_model.fit(X_train, y_train)

In [56]:
# Predict once per split and reuse the result for both metrics
# (previously every predict call was executed twice).
train_preds = multilabel_model.predict(X_train)
test_preds = multilabel_model.predict(X_test)

print("Train acc. score:", accuracy_score(y_true = y_train, y_pred = train_preds), end = "\n")
print("Train hamming loss:", hamming_loss(y_true = y_train, y_pred = train_preds), end = "\n\n")
print("Test acc. score:", accuracy_score(y_true = y_test, y_pred = test_preds), end = "\n")
print("Test hamming loss:", hamming_loss(y_true = y_test, y_pred = test_preds), end = "\n\n")

## **Creating a function for the model -**

In [None]:
def model_func(question):
    """Predict the tag indicator row for a single raw question string.

    The text goes through the same cleaning steps, in the same order, that the
    notebook applies to the training ``Body`` column. It is then vectorised with
    the fitted ``tfidf_vectorizer`` and passed to ``multilabel_model``.

    Parameters
    ----------
    question : str
        Raw question text; may contain the HTML fragments handled below.

    Returns
    -------
    The output of ``multilabel_model.predict`` for a one-row input.

    Notes
    -----
    Relies on the notebook globals ``stop_words``, ``tfidf_vectorizer`` and
    ``multilabel_model`` being defined.
    NOTE(review): the training text was the cleaned body plus the *uncleaned*
    title, whereas here the whole input is cleaned — confirm this is intended.
    """
    # Local import so the function does not depend on the cleaning cell's import.
    import re

    # Raw string: written as a normal literal, \/ \d \. \w are invalid escape
    # sequences (SyntaxWarning on Python 3.12+). The pattern is unchanged.
    url_pattern = r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)'

    # processing the question -
    question = question.replace('<p>', ' ')
    question = re.sub(url_pattern, ' ', question)
    for fragment in ('</p>', '\n', '</a>'):
        question = question.replace(fragment, ' ')

    # Lower-case only after the URL step, exactly as in the training pipeline.
    question = question.lower()
    question = ' '.join(w for w in question.split() if w not in stop_words)
    question = question.replace('<a href=" ">', '')
    question = ' '.join(w for w in question.split() if len(w) > 3)

    # applying tfidf -
    tr_question = tfidf_vectorizer.transform(pd.Series([question]))

    # predicting the tags for the input question -
    return multilabel_model.predict(tr_question)

In [None]:
# Example question used to try out the prediction function; the call's return
# value is shown as the cell output.
sample_input = "I'm not able to load tsql in visual-studio and also in chrome , I'm such a noob :("
model_func(sample_input)

In [None]:
# Keep the prediction for the example question so it can be decoded below.
sample_output = model_func(sample_input)

In [None]:
# displaying the tags for the sample input -
# the fitted MultiLabelBinarizer converts the predicted indicator row back
# into tag names.
mlb.inverse_transform(sample_output)

In [None]:
import pickle

# Persist the fitted classifier. The context manager closes the file handle
# (and thereby flushes the data) even if pickling fails part-way.
with open("sof_first_model.sav", 'wb') as model_file:
    pickle.dump(multilabel_model, model_file)

In [None]:
# Persist the fitted TF-IDF vectorizer; the context manager guarantees the
# file handle is closed after writing.
with open("tfidf_object.pickle", "wb") as tfidf_file:
    pickle.dump(tfidf_vectorizer, tfidf_file)

In [None]:
# Persist the fitted MultiLabelBinarizer (needed to decode predictions back
# into tag names); the context manager guarantees the file handle is closed.
with open("multilabel_binarizer.pickle", "wb") as mlb_file:
    pickle.dump(mlb, mlb_file)