### IMPORTING LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import string

In [None]:
# Mount Google Drive at /content/drive (Colab-only import); the CSV files read
# in the next section are loaded from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

### READING THE DATA

In [None]:
# Load the questions table and the tags table from the mounted Drive folder.
# Both files are decoded with the 'latin' codec instead of the UTF-8 default.
question= pd.read_csv('/content/drive/MyDrive/stackoverflow tag/Questions.csv', encoding='latin')
tags= pd.read_csv('/content/drive/MyDrive/stackoverflow tag/Tags.csv', encoding='latin')

In [None]:
# Preview the first rows of the questions table.
question.head()

In [None]:
# Preview the first rows of the tags table.
tags.head()

In [None]:
# (rows, columns) of the questions table, then of the tags table.
print(question.shape,tags.shape)

In [None]:
# Number of distinct Id values in each table, to compare how many questions each one covers.
print(question.Id.nunique(), tags.Id.nunique())

### MERGING THE DATA FRAMES

In [None]:
# Make every Tag value a string so the join below cannot fail on non-string values.
tags['Tag'] = tags['Tag'].astype(str)

# Collapse to one row per Id, holding all of that Id's tags in a single
# space-separated string (index: Id, single column: Tag).
grouped_tags = (
    tags.groupby("Id")['Tag']
    .apply(lambda tag_values: ' '.join(tag_values))
    .to_frame(name='Tag')
)
print(grouped_tags.head())

# Split each space-separated string back into a list of tags.
grouped_tags['Tag'] = grouped_tags['Tag'].astype(str).str.split()

# Order the rows by the Id index level.
grouped_tags = grouped_tags.sort_values(by='Id')
print(grouped_tags.head())

In [None]:
# (rows, columns) of the grouped table: one row per distinct Id.
print(grouped_tags.shape)

1. Copying the question id from the index of grouped_tags into an `Ids` column
2. Merging the question and grouped_tags dataframes on `Ids` to get df

In [None]:
# grouped_tags is indexed by Id; expose that key as a regular column so it can
# be used as the merge key.
grouped_tags['Ids'] = grouped_tags.index

# Rename only the key column. Overwriting the whole column list positionally
# would silently mislabel columns if the CSV column order ever differed.
question = question.rename(columns={'Id': 'Ids'})
question = question.sort_values(by='Ids')

# Left join: every question is kept; questions without tag rows get NaN in Tag.
df = pd.merge(question, grouped_tags, how='left', on='Ids')

In [None]:
# Preview the merged questions + tags frame.
df.head()

### REMOVING UNNECESSARY VARIABLES

In [None]:
# Identifier and date columns are not used as features; keep the remaining columns.
unused_columns = ['Ids', 'OwnerUserId', 'CreationDate', 'ClosedDate']
df = df.drop(columns=unused_columns)
df.head()

### FILTERING DATA BASED ON SCORE AND MOST FREQUENTLY USED TAGS

In [None]:
# Smallest and largest question score, as context for the score threshold applied below.
print(df.Score.min(), df.Score.max())

In [None]:
# z= df['Tag'].value_counts().sort_values(ascending=False)
# z.index

In [None]:
# df1= df.groupby(by='Tag')['Tag'].count().sort_values(ascending=False).to_frame()
# df1.columns= ['Tag_count']
# df1['Tags']=df1.index

In [None]:
# df.columns= ['Score', 'Title', 'Body', 'Tags']
# df1= pd.merge(df,df1,how='left',on='Tags')
# df1.head()

In [None]:
# Tag-frequency filter (currently disabled): df1= df1[df1['Tag_count']>=500]
# Keep only questions whose score is above 6. The explicit .copy() makes df1 an
# independent frame, so the column assignments in later cells neither raise
# SettingWithCopyWarning nor write into a temporary slice of df.
df1 = df.loc[df['Score'] > 6].copy()
df1.shape

For better prediction, only questions with a score greater than 6 are kept; the filter that would keep only tags used at least 500 times is currently commented out. Low scores mean that the question is either erroneous or does not have sufficient information.

In [None]:
# Inspect the Python type of the Tag column object.
type(df1['Tag'])

In [None]:
# Display the Tag column of the filtered frame.
df1['Tag']

In [None]:
# df1.Tag.value_counts().sort_values(ascending=False)

### CHECKING FOR MISSING VALUES

In [None]:
# Count of missing values in each column of the filtered frame.
print(df1.isnull().sum())

# (rows, columns) of the filtered frame, for reference next to the null counts.
print('Shape of df1:',df1.shape)

### CLEANING THE TEXT FOR TITLE AND BODY

1. Removing punctuation
2. Removing HTML tags (if required)
3. Changing text into lowercase
4. Splitting the text into words
5. Removing stopwords

#### PUNCTUATION & HTML TAGS REMOVAL, LOWERCASE, WORD TOKENIZATION

In [None]:
import string

# Translation table that deletes every character in string.punctuation; built
# once so each call is a single pass over the text instead of one pass per
# punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without ASCII punctuation. Whitespace and all other
        characters (including non-ASCII punctuation) are left untouched.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [None]:
# Titles must be strings before the text helpers are applied.
df1['Title'] = df1['Title'].astype(str)

# Title1: punctuation stripped, lower-cased, then split on whitespace into a token list.
df1['Title1'] = (
    df1['Title']
    .apply(remove_punctuation)
    .str.lower()
    .str.split()
)
df1['Title1'].head()

In [None]:
import re

# Matches one HTML/XML tag: '<', then one or more non-'<' characters (lazy), then '>'.
_HTML_TAG = re.compile('<[^<]+?>')

# Bodies must be strings before the regex is applied.
df1['Body'] = df1['Body'].astype(str)

# Body1: the body text with every tag match removed.
df1['Body1'] = df1['Body'].apply(lambda body: _HTML_TAG.sub('', body))
df1['Body1'].head()

In [None]:
# Body1: punctuation stripped, lower-cased, then split on whitespace into a token list.
df1['Body1'] = (
    df1['Body1']
    .apply(remove_punctuation)
    .str.lower()
    .str.split()
)
df1['Body1'].head()

#### LEMMATIZATION

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
# Download the 'wordnet' and 'omw-1.4' NLTK resources before the lemmatizer is used.
nltk.download('wordnet')
nltk.download('omw-1.4')
# Single shared lemmatizer instance, used by word_lemmatizer below.
lematizer= WordNetLemmatizer()

def word_lemmatizer(text):
    """Lemmatize every token in ``text`` and return the results as a new list.

    ``text`` is iterated item by item; each item is passed to the
    ``lemmatize`` method of the module-level ``lematizer``.
    """
    return list(map(lematizer.lemmatize, text))

In [None]:
# Lemmatize each title's token list (the helper is passed directly; no wrapper lambda needed).
df1['Title1'] = df1['Title1'].apply(word_lemmatizer)

In [None]:
# Lemmatize each body's token list (the helper is passed directly; no wrapper lambda needed).
df1['Body1'] = df1['Body1'].apply(word_lemmatizer)

#### STOPWORD REMOVAL

In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords
# Keep the English stop words in a set: the cells below test `w not in stopwords`
# for every token of every document, and set membership is O(1) whereas a list
# is scanned linearly on each test.
# Note: this rebinds the name `stopwords` from the NLTK corpus reader to the
# collection of words; the import above restores the reader if the cell is re-run.
stopwords = set(stopwords.words('english'))

In [None]:
# Drop stop words from each body's token list and join the remaining tokens
# back into one space-separated string.
df1['Body1'] = df1['Body1'].apply(
    lambda tokens: " ".join(token for token in tokens if token not in stopwords)
)
df1['Body1'].head(5)

In [None]:
# Drop stop words from each title's token list and join the remaining tokens
# back into one space-separated string.
df1['Title1'] = df1['Title1'].apply(
    lambda tokens: " ".join(token for token in tokens if token not in stopwords)
)
df1['Title1'].head(5)

### FINAL DATAFRAME AFTER TEXT CLEANING

In [None]:
# The raw text and the score are no longer needed once the cleaned columns exist.
raw_columns = ['Title', 'Body', 'Score']
df1 = df1.drop(columns=raw_columns)
df1.head()

### TF-IDF VECTORIZATION

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the cleaned titles.
# NOTE(review): X1 is not referenced by any later cell visible in this notebook
# (the train/test split uses X2 only), and `vectorizer` is rebound in the next cell.
df1['Title1']= df1['Title1'].astype(str)
vectorizer = TfidfVectorizer()
X1 = vectorizer.fit_transform(df1['Title1'].str.lower())

In [None]:
# TF-IDF features for the cleaned bodies; X2 is the feature matrix that is
# later passed to train_test_split.
# `vectorizer` now refers to this body vectorizer (replacing the title one); it
# is the object used to transform new text in the final cell.
df1['Body1']= df1['Body1'].astype(str)
vectorizer = TfidfVectorizer()
X2 = vectorizer.fit_transform(df1['Body1'].str.lower())

### CHANGING CATEGORICAL VARIABLES INTO NUMERIC

In [None]:
# from sklearn.preprocessing import LabelEncoder
# le= LabelEncoder() 
# df1['Tags']= le.fit_transform(df1['Tags'])

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Binarizer for the per-question tag lists; kept in a variable because the
# final cell calls its inverse_transform to map predictions back to tags.
multilabel= MultiLabelBinarizer()

In [None]:
# Fit the binarizer on the Tag column (one list of tags per row) and encode it as the target y.
y= multilabel.fit_transform(df1['Tag'])

In [None]:
# Display the Tag column for comparison with the encoded target shown next.
df1['Tag']

In [None]:
# Display the encoded target produced by the binarizer.
y

### SPLITTING THE DATASET INTO TRAIN AND TEST SET

In [None]:
# Shape of the encoded target.
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state=42 fixes the split.
# Only the body TF-IDF matrix X2 is used as features here.
x_train, x_test, y_train, y_test = train_test_split(X2, y, test_size=0.30, random_state=42)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

### APPLYING DIFFERENT ALGORITHMS

In [None]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier

# Logistic regression wrapped in a one-vs-rest classifier, fitted on the training split.
lr = LogisticRegression(C=10)
clf = OneVsRestClassifier(lr)
model = clf.fit(x_train, y_train)
prediction = model.predict(x_test)

print(metrics.multilabel_confusion_matrix(y_test, prediction))

# The figure reported here is the samples-averaged F1 score (the value that was
# previously scraped from the last row of the classification_report text), not
# accuracy. Computing it directly avoids parsing a formatted string.
F1_Score = metrics.f1_score(y_test, prediction, average='samples')
print('Samples-averaged F1 score of the model:', round(F1_Score, 2))

In [None]:
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier

# SGD shuffles the training data, so seed it to make the fit reproducible
# (same seed as the train/test split).
sgd = SGDClassifier(random_state=42)
clf = OneVsRestClassifier(sgd)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

print(metrics.multilabel_confusion_matrix(y_test, y_pred))

# Samples-averaged F1 score, computed directly instead of being parsed out of
# the classification_report text; note this is an F1 score, not accuracy.
F1_Score = metrics.f1_score(y_test, y_pred, average='samples')
print('Samples-averaged F1 score of the model:', round(F1_Score, 2))

In [None]:
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

sv = LinearSVC()
clf = OneVsRestClassifier(sv)
clf.fit(x_train, y_train)

# Predict with the LinearSVC fitted just above (`clf`), and score those
# predictions -- not the earlier logistic-regression `model` / `prediction`.
y_pred = clf.predict(x_test)

# Samples-averaged F1 score, computed directly instead of being parsed out of
# the classification_report text; note this is an F1 score, not accuracy.
F1_Score = metrics.f1_score(y_test, y_pred, average='samples')
print('Samples-averaged F1 score of the model:', round(F1_Score, 2))


Implementation of model

In [None]:
# Raw question text to tag.
x = [ 'how to write ml code in python and java i have data but do not know how to do it']

# Apply the same cleaning that produced the training bodies (strip HTML tags,
# remove punctuation, lower-case, tokenize, lemmatize, drop stop words) so the
# tokens line up with the vocabulary the body vectorizer was fitted on.
x_clean = [
    " ".join(
        w
        for w in word_lemmatizer(
            remove_punctuation(re.sub('<[^<]+?>', '', text)).lower().split()
        )
        if w not in stopwords
    )
    for text in x
]

# `vectorizer` is the body TF-IDF vectorizer, matching the X2 features the
# classifiers were trained on.
xt = vectorizer.transform(x_clean)

# Predict once and map the indicator rows back to tag names.
multilabel.inverse_transform(clf.predict(xt))