<a href="https://colab.research.google.com/github/ad451/Stackoverflow_tag_generator/blob/main/StackOverflow_Tag_Generator_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


**Importing the required modules**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import random
from bs4 import BeautifulSoup
import time
import warnings; warnings.simplefilter('ignore')
import re
import string
from collections import Counter
from google.colab import files


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag


from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,jaccard_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn import model_selection, naive_bayes, svm
from sklearn.model_selection import GridSearchCV

import tensorflow as tf
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier


# Fetch the NLTK resources used later in the notebook.
nltk.download('averaged_perceptron_tagger') # tagger model used by pos_tag (parts of speech)
nltk.download('wordnet') # WordNet data used by WordNetLemmatizer
nltk.download('stopwords') # stop-word lists used when filtering the questions


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Web Scraping the current set of questions for testing the model**

In [None]:


Questions=[] #list of [question text, first tag] pairs

BASE_URL="https://stackoverflow.com"
REQUEST_TIMEOUT=30 #seconds; without a timeout a stalled connection blocks the loop forever


def _parse_question(html):
    """Return ``[title + " " + description, first tag]`` for one question page.

    Returns None when any of the expected elements (header, body, tag list)
    is missing, so the caller can skip the page instead of crashing.
    """
    page=BeautifulSoup(html,'html.parser')
    header=page.find("div",id="question-header")
    body=page.find("div",class_="s-prose js-post-body")
    tag_list=page.find("ul",class_="ml0 list-ls-none js-post-tag-list-wrapper d-inline")

    if header is None or header.h1 is None or header.h1.a is None:
        return None
    if body is None or tag_list is None or tag_list.li is None:
        return None

    title=header.h1.a.string
    if title is None:
        return None

    #join with spaces so the last word of one paragraph does not fuse with the first word of the next
    description=" ".join(para.text for para in body.find_all("p"))

    #only the first tag of the question is kept
    return [title+" "+description,tag_list.li.text]


skipped=0 #questions whose page could not be parsed

for pageNumber in range(1,401):
    response=requests.get(f"{BASE_URL}/questions",
                          params={"tab":"newest","page":pageNumber,"pagesize":50},
                          timeout=REQUEST_TIMEOUT)

    data=BeautifulSoup(response.text,'html.parser' ) #parsing the html text

    req_data=data.find(id="questions")
    if req_data is None:
        #the response is not a question listing (e.g. an error page)
        print(f"page {pageNumber}: no question list found (HTTP {response.status_code}), skipping")
        continue

    new_data=req_data.find_all("h3", class_="s-post-summary--content-title") # the tag that contains the question info

    for element in new_data:
        if element.a is None or not element.a.get('href'):
            skipped+=1
            continue
        link=element.a.attrs['href']

        #the href is expected to be site-relative ("/questions/...");
        #strip its leading slash so the URL does not contain "//" after the host
        response=requests.get(BASE_URL+"/"+link.lstrip("/"),timeout=REQUEST_TIMEOUT)  #fetching the content related to the question

        parsed=_parse_question(response.text)
        if parsed is None:
            skipped+=1
            continue

        Questions.append(parsed)

    print(pageNumber)   #checking which page questions have been fetched yet

print(len(Questions))
print(f"skipped {skipped} questions whose page could not be parsed")








**Combine the input questions and tags tables**

In [None]:
###################################### Code for inner combine ######################################
df1 = pd.read_csv('Questions.csv', encoding='ISO-8859-1')

df2 = pd.read_csv('Tags.csv', encoding='ISO-8859-1')

# combined dataframe of questiontags; a question appears once per tag it carries
# NOTE(review): DataFrame.join defaults to a LEFT join, not an inner join as the banner says.
df3 = df1.set_index('Id').join(df2.set_index('Id'))

df3=shuffle(df3)

df3 = df3.reset_index()



###################################### Code for preparing the train_data from the total data ######################################


#only taking questions with score greater than or equal to 3

df4=df3[df3["Score"]>=3]


#generating the list of all unique question ids and also for ranking tags based on their popularity

unique_ids=Counter(df4["Id"])

Tags=Counter(df4["Tag"])

# (count, tag) pairs, most frequent first.
# The original read the tag names from an undefined `df7`; both halves must come from df4.
q=sorted(zip(Tags.values(),Tags.keys()),reverse=True)

# rank 1 = most frequent tag, rank len(q) = least frequent tag
rank={tag: position for position,(_,tag) in enumerate(q,start=1)}


keys=list(unique_ids.keys())

# iterating over each unique question id and assigning only one tag to that based on the ranking of the tag

Final_dataframe={"Body":[],"Title":[],"Tags":[]}

# groupby(sort=False) yields the ids in first-appearance order (same order as `keys`)
# in a single pass, instead of re-filtering the whole frame once per id.
for key,current_df in df4.groupby("Id",sort=False):

    selected_tag=-1
    selected_tag_rank=-1

    # NOTE(review): this keeps the tag with the LARGEST rank number, i.e. the least
    # frequent tag of the question. Confirm that this (and not the most popular tag) is intended.
    for tag in current_df["Tag"]:
        if rank[tag]>selected_tag_rank:
            selected_tag=tag
            selected_tag_rank=rank[tag]

    Final_dataframe["Body"].append(current_df["Body"].iloc[0])
    Final_dataframe["Title"].append(current_df["Title"].iloc[0])
    Final_dataframe["Tags"].append(selected_tag)

df5=pd.DataFrame(Final_dataframe)

#concatenating the title and the body columns into the Questions column
df5["Questions"]=df5["Title"]+" "+df5["Body"]

df5=df5.drop(columns=["Body","Title"])


train_data=df5.copy()

**Machine learning part (Preprocessing and exploration)**

---



In [None]:
# Load the prepared train/test tables from disk.
# Note: this replaces the train_data built in the previous cell.
train_data=pd.read_csv("questiontags_train.csv")
test_data=pd.read_csv("questiontags_test.csv")

In [None]:
#drop the first column and rename the columns of the train dataset

# The first column is discarded by position (presumably a saved index column — TODO confirm).
# Not idempotent: re-running this cell drops one more column each time.
train_data.drop(train_data.columns[0],axis=1,inplace=True)


# Use the column names the rest of the notebook expects ("Questions" / "Tags").
train_data.rename(columns={"Title":"Questions","Tag":"Tags"},inplace=True)


In [None]:
#dropping the rows that have no tag (missing value in the Tags column)

train_data.dropna(subset=['Tags'], inplace=True)


In [None]:
#combining the webscraped data and the train_data
# test_data is appended below train_data, then the rows are shuffled and re-indexed from 0.
# NOTE(review): shuffle() is called without random_state, so the row order differs between runs.

train_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
train_data=shuffle(train_data)
train_data=train_data.reset_index(drop=True)

In [None]:
# preview the first rows of the combined, shuffled data
train_data.head()

In [None]:
#data cleaning
substrings_to_replace = ['</p>', '<p>','\n','<pre>','</pre>','<a href=" ">']
for substring in substrings_to_replace:
    # regex=False: these are literal markers, not patterns
    train_data['Questions'] = train_data['Questions'].str.replace(substring, ' ', regex=False)


#removing code snippets (everything between <code> and </code>, across line breaks)
train_data['Questions']=train_data['Questions'].apply(lambda x : re.sub(r'<code>.*?</code>', ' ', x, flags=re.DOTALL))

#removing any urls
# Raw string, so the backslashes reach the regex engine unchanged.
# The trailing character class deliberately contains NO space: with a space in it,
# the match continued through the following words and deleted ordinary text after
# every dotted token.
url_pattern = re.compile(r'(https?://)?([\da-z.-]+)\.([a-z.]{2,6})([/\w.-]*)')
train_data['Questions']=train_data['Questions'].apply(lambda x : url_pattern.sub(' ', x))

#keeps only words longer than 3 characters (removes words with len<=3)
train_data['Questions'] = train_data['Questions'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

train_data['Questions']=train_data['Questions'].apply(lambda x : x.lower()) #converting to lowercase


In [None]:
#removing the stop words and then the punctuation characters from the questions

punctuations = string.punctuation

stop_words=set(nltk.corpus.stopwords.words('english'))


def _without_stop_words(text):
    """Return `text` with every whitespace-separated word found in stop_words removed."""
    kept=[]
    for word in text.split():
        if word not in stop_words:
            kept.append(word)
    return ' '.join(kept)


def _without_punctuation(text):
    """Return `text` with punctuation characters stripped from each word."""
    stripped=[]
    for word in text.split():
        stripped.append(''.join(ch for ch in word if ch not in punctuations))
    return ' '.join(stripped)


train_data['Questions']=train_data['Questions'].apply(_without_stop_words)
train_data['Questions']=train_data['Questions'].apply(_without_punctuation)


In [None]:
#dropping the rows whose question is empty (or whitespace-only) after filtering

# One vectorised mask instead of a Python loop that looked rows up by label and
# dropped them one at a time; this no longer depends on the index being 0..n-1.
# strip() also catches questions that consist only of spaces.
empty_mask=train_data['Questions'].str.strip().eq('')
print(f"dropping {int(empty_mask.sum())} empty questions")
train_data=train_data[~empty_mask].reset_index(drop=True)

In [None]:
#using lemmatization on the questions of the train and test dataset
def lemmatization(text):
    """Lemmatize a list of tokens, using each token's part-of-speech tag.

    Args:
        text: list of word tokens (passed to ``pos_tag`` as-is).

    Returns:
        list of lemmas, one per input token, in the same order.
    """
    # first letter of the tag -> POS code handed to the lemmatizer
    wordnet_pos = {'N': 'n', 'V': 'v', 'R': 'r', 'J': 'a'}

    lemmatizer = WordNetLemmatizer()
    tagged_tokens = pos_tag(text)

    # any tag that is not noun/verb/adverb/adjective is lemmatized as a noun
    return [
        lemmatizer.lemmatize(token, pos=wordnet_pos.get(tag[0].upper(), 'n'))
        for token, tag in tagged_tokens
    ]

# split each question into words, lemmatize them and join the lemmas back into one string
train_data['Questions']=train_data['Questions'].apply(lambda question : " ".join(lemmatization(question.split())))



In [None]:
#Analysing certain parameters about the train data

def tokenize_question(text):
    """Split a question into its whitespace-separated tokens."""
    tokens = text.split()
    return tokens

questions = train_data['Questions'].tolist()

# total number of whitespace-separated words over all questions
total_words = 0
for text in questions:
    total_words += len(text.split())
print('The total number of words in the data is: ', total_words)



# note: `questions` is rebound from the list of texts to the fitted document-term matrix
question_vect = CountVectorizer(tokenizer=tokenize_question)
questions=question_vect.fit_transform(questions)

vocabulary_size = len(question_vect.vocabulary_)
print('The number of words in the vocabulary is: ', vocabulary_size)





In [None]:
tags = train_data['Tags'].tolist()
tags_Freq=Counter(tags)

print("Total number of unique tags : ",len(tags_Freq.keys()))

# (tag, frequency) pairs, most frequent first
tags2=tags_Freq.most_common()
total_frequency=sum(tags_Freq.values())

MAX_TOP_TAGS=20        # at most this many tags are kept
COVERAGE_TARGET=0.95   # stop accumulating once this share of all tag occurrences is exceeded

current=0  # occurrences covered by the tags counted so far
idx=0      # number of tags counted so far
for _tag,_freq in tags2[:MAX_TOP_TAGS]:
    if current/total_frequency>COVERAGE_TARGET:
        break
    current+=_freq
    idx+=1

# idx is a count, so it is printed as-is. The previous index-based loop printed idx+1,
# which was one too many whenever the loop ended on the coverage condition.
coverage=round(current*100/total_frequency,2) if total_frequency else 0.0
print(f"Number of tags that account for {coverage}% of all tags appearance : ",idx)

# NOTE: despite the name these hold the top MAX_TOP_TAGS (20) tags; the name is kept
# because later cells refer to it. Slicing copes with fewer than 20 distinct tags.
top_300_tags=[tag for tag,_ in tags2[:MAX_TOP_TAGS]]
top_300_tags_values=[freq for _,freq in tags2[:MAX_TOP_TAGS]]

# plt.bar(top_300_tags, top_300_tags_values, width=0.5, color='r')
# plt.xlabel('Tags')
# plt.ylabel('Frequencies')
# plt.title('Top 20 Tags and Frequencies')
# plt.xticks(rotation=90)  # Rotate the x-axis labels for better visibility
# plt.tight_layout()  # Adjust the layout to prevent label cutoff
# plt.show()


In [None]:

# work on a copy so that train_data itself is not modified by the feature-extraction cells
train_new=train_data.copy()


In [None]:


def tokenize_question(text):
    """Return the whitespace-delimited tokens of `text` as a list."""
    words = text.split()
    return words

def filter_number_features(name):
  """Decide whether a vocabulary term is kept as a feature.

  A term is rejected when it has three characters or fewer, or when it starts
  with an ASCII digit.

  Args:
    name: the feature (term) name.

  Returns:
    True if the term should be kept, False otherwise.
  """
  # The length check comes first so that an empty name is rejected
  # instead of raising IndexError on name[0].
  if len(name)<=3:
    return False
  return name[0] not in '0123456789'
tfidf_vect = TfidfVectorizer(tokenizer=tokenize_question,
                               stop_words='english',
                               min_df=4,
                               max_df=0.5,max_features=1000)

# .toarray() gives a plain ndarray (the same call the count-vectorizer helper uses);
# .todense() returned the discouraged np.matrix type.
X_train_tfidf = tfidf_vect.fit_transform(train_new["Questions"]).toarray()
# print('The number of words in the vocabulary is: ', len(tfidf_vect.vocabulary_))



#get the feature names
feature_names=tfidf_vect.get_feature_names_out()

# Get the IDF scores
idf_scores = tfidf_vect.idf_

# keep only the terms accepted by filter_number_features, paired with their IDF score
Final_Feature_Set=[[feature_name,idf_scores[idx]]
                   for idx,feature_name in enumerate(feature_names)
                   if filter_number_features(feature_name)]

# order the kept terms by IDF, highest first, and keep only the names
Final_Feature_Set=sorted(Final_Feature_Set,key=lambda x :x[1],reverse=True)
Final_Feature_Set=[x[0] for x in Final_Feature_Set]


#Another approach (tfidf vectorization as feature set)

df_train = pd.DataFrame(X_train_tfidf, columns=feature_names)

df_train=df_train[Final_Feature_Set]

# Assign by position, not by index label: the rows of df_train are in the same order as
# train_new, and a non-default index on train_new must not turn tags into NaN.
df_train["Tags"]=train_new["Tags"].to_numpy()


In [None]:
def tokenize_question(text):
      """Tokenize `text` by splitting it on runs of whitespace."""
      pieces = text.split()
      return pieces

def CountVectorizer_Custom(data,Features):
  """Encode the questions as binary term-presence features over a fixed vocabulary.

  Args:
    data: frame with a 'Questions' column (text) and a 'Tags' column.
    Features: the vocabulary; one output column is produced per entry.

  Returns:
    DataFrame with one 0/1 column per vocabulary term plus a 'Tags' column.
  """
  vectorizer = CountVectorizer(tokenizer=tokenize_question,binary=True,vocabulary=Features)
  presence_matrix = vectorizer.fit_transform(data['Questions'].tolist())

  encoded = pd.DataFrame(presence_matrix.toarray(), columns=vectorizer.get_feature_names_out())
  encoded["Tags"]=data['Tags'].tolist()
  return encoded

#using count vectorizer as feature set
# NOTE(review): this rebinds train_new to the encoded frame, which has no "Questions"
# column, so cells that read train_new["Questions"] cannot be re-run afterwards
# without re-creating train_new.

train_new=CountVectorizer_Custom(train_new,Final_Feature_Set)









In [None]:
def filter_data_by_most_common_tags(data, common_tags):
    """Keep only the rows of `data` whose 'Tags' value is one of `common_tags`.

    The original index labels of the surviving rows are preserved.
    """
    keep_row = data["Tags"].isin(common_tags)
    return data.loc[keep_row]


def one_hot(column,data): #count vectorizer feature set
  """Replace `column` of `data` with one indicator column per distinct value.

  Args:
    column: name of the categorical column to encode.
    data: input frame; it is not modified.

  Returns:
    A new frame with every column of `data` except `column`, followed by the
    indicator columns, which are named "tag_<value>".
  """
  # Perform one-hot encoding using get_dummies()
  one_hot_encoded = pd.get_dummies(data[column],prefix="tag")

  # Drop the column that was actually encoded (the name used to be hard-coded to 'Tags',
  # which failed or removed the wrong column for any other `column`), then append
  # the indicator columns.
  data_extended = pd.concat([data.drop(columns=[column]), one_hot_encoded], axis=1)

  return data_extended

def label_encoding(data, most_common_tags):#tfidf vectorizer
    """Replace each tag in ``data["Tags"]`` by its position in `most_common_tags`.

    Tags that are not in `most_common_tags` become -1. The column is
    overwritten on `data` itself, and that same object is returned.
    """
    tag_to_code = {}
    for position, tag in enumerate(most_common_tags):
        tag_to_code[tag] = position
    data["Tags"] = data["Tags"].apply(lambda tag: tag_to_code.get(tag, -1))
    return data


# keep only the questions whose tag is one of the selected top tags
# NOTE(review): df_train holds the TF-IDF features; the frame produced by
# CountVectorizer_Custom (train_new) is not used here — confirm which feature set is intended.
Final_train=filter_data_by_most_common_tags(df_train,top_300_tags)

#performing the one hot encoding of the data
# (the "Tags" column is replaced by one "tag_<name>" indicator column per tag, appended at the end)


Final_train=one_hot("Tags",Final_train)





#Another Approach (tfidf vectorizer)
# Final_train=label_encoding(Final_train,most_common_tags)
# Final_test=label_encoding(Final_test,most_common_tags)



In [None]:
# persist the feature/label table (without the index) so later cells can start from the file
Final_train.to_csv("Final_train.csv", index=False)


In [None]:
# reload the saved feature/label table
Final_train=pd.read_csv("Final_train.csv")


In [None]:
# files.download('Final_train.csv')

In [None]:
# The last len(top_300_tags) columns are taken as the labels, everything before them as features.
# NOTE(review): this assumes every tag in top_300_tags produced an indicator column — verify.
X_train=Final_train.iloc[:,:-1*len(top_300_tags)]

Y_train=Final_train.iloc[:,-1*len(top_300_tags):]

In [None]:
# hold out 20% of the rows for testing (fixed seed).
# Not idempotent: X_train/Y_train are overwritten, so re-running this cell splits the already reduced training set again.
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)


In [None]:
#function to calculate the evaluation metrics

def eval_metrics(y_test, y_predicted, print_metrics=True):
    """Compute F1, precision, recall (all 'weighted' averages) and accuracy.

    Args:
        y_test: true labels.
        y_predicted: predicted labels.
        print_metrics: when True, also print the four scores on one line.

    Returns:
        tuple ``(f1, precision, recall, accuracy)``.
    """
    scores = (
        f1_score(y_test, y_predicted, average='weighted'),
        precision_score(y_test, y_predicted, average='weighted'),
        recall_score(y_test, y_predicted, average='weighted'),
        accuracy_score(y_test, y_predicted),
    )

    if print_metrics:
        print("f1: %.3f - precision: %.3f - recall: %.3f - accuracy: %.3f" % scores)
    return scores
def j_score(y_true, y_pred):
  """Mean per-row Jaccard score of two indicator matrices, as a percentage.

  For each row the element-wise minimum is summed (intersection) and divided
  by the summed element-wise maximum (union); the row ratios are averaged.
  """
  intersection = np.minimum(y_true, y_pred).sum(axis = 1)
  union = np.maximum(y_true, y_pred).sum(axis = 1)
  per_row = intersection/union
  return per_row.mean()*100


def print_score(y_pred,y_test):
  """Print the Jaccard score of the predictions, followed by a separator line.

  Note the parameter order: predictions first, true labels second.
  """
  score = j_score(y_test, y_pred)
  print(f'Jacard score: {score}')
  print('----')

def convert(pred,original):
    """Turn per-class rows into class indices with a row-wise argmax.

    Args:
        pred: 2-D array of predicted scores, indexed as ``pred[row, :]``.
        original: DataFrame of true indicator columns, indexed with ``.iloc``.

    Returns:
        ``(original_indices, predicted_indices)`` — note the order: true labels first.
    """
    true_labels = []
    for row in range(original.shape[0]):
        true_labels.append(np.argmax(original.iloc[row,:]))

    predicted_labels = []
    for row in range(pred.shape[0]):
        predicted_labels.append(np.argmax(pred[row,:]))

    return true_labels,predicted_labels


**Machine learning using classification algos**

In [None]:
%%time
# Classifier - Algorithm - Logistic Regression
# One logistic-regression model is fitted per tag column (one-vs-rest).

log_clf = OneVsRestClassifier(LogisticRegression())

log_clf.fit(X_train, Y_train)

# metrics on the data the model was fitted on
Y_train_predict = log_clf.predict(X_train)
eval_metrics(Y_train, Y_train_predict)

# metrics on the held-out split
Y_test_predict=log_clf.predict(X_test)
eval_metrics(Y_test,Y_test_predict)

In [None]:
# Classifier - Algorithm - SVM (one linear-kernel SVC per tag column)

linear_svc = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM = OneVsRestClassifier(linear_svc)
SVM.fit(X_train,Y_train)

# predict the labels of the training split and of the held-out split
predictions_SVM_train = SVM.predict(X_train)
predictions_SVM_test = SVM.predict(X_test)

# accuracy in percent
svm_train_accuracy = accuracy_score(predictions_SVM_train, Y_train)*100
svm_test_accuracy = accuracy_score(predictions_SVM_test, Y_test)*100
print("SVM train Accuracy Score -> ",svm_train_accuracy)
print("SVM test Accuracy Score -> ",svm_test_accuracy)

In [None]:
# NB classifier (one multinomial naive Bayes model per tag column)
multinomial_nb = naive_bayes.MultinomialNB()
Naive = OneVsRestClassifier(multinomial_nb)

Naive.fit(X_train,Y_train)

# predict the labels of the training split and of the held-out split
predictions_NB_train = Naive.predict(X_train)
predictions_NB_test = Naive.predict(X_test)

# accuracy in percent
nb_train_accuracy = accuracy_score(predictions_NB_train, Y_train)*100
nb_test_accuracy = accuracy_score(predictions_NB_test, Y_test)*100
print("Naive Bayes train Accuracy Score -> ",nb_train_accuracy)
print("Naive Bayes test Accuracy Score -> ",nb_test_accuracy)

**Neural Network part (RNN)**

In [None]:

# Turn the train/test feature tables into plain numpy arrays
X_train = np.array(X_train)
X_test = np.array(X_test)

# Insert a middle axis of length 1: the LSTM input becomes (samples, 1, features)
n_features = X_train.shape[1]
X_train_lstm = X_train.reshape((X_train.shape[0], 1, n_features))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, n_features))

In [None]:


# Define and compile the RNN model
def build_model(units=64, dropout_rate=0.2,reg_lambda=0.001, activation='relu',optimizer='adam'):
    """Build and compile the single-layer LSTM tag classifier.

    The input shape is read from the global ``X_train_lstm`` and the number of
    output units from the global ``Y_train``.

    Args:
        units: number of LSTM units.
        dropout_rate: dropout applied to the LSTM output.
        reg_lambda: L2 factor for the LSTM kernel regularizer.
        activation: requested output-layer activation. The model is trained with
            binary cross-entropy, which needs outputs in [0, 1], so only
            'sigmoid' and 'softmax' are honoured; any other value (including the
            default 'relu', whose output is unbounded) falls back to 'sigmoid'.
        optimizer: optimizer passed to ``compile``.

    Returns:
        The compiled Keras model.
    """
    output_activation = activation if activation in ('sigmoid', 'softmax') else 'sigmoid'

    model = Sequential()
    model.add(LSTM(units, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), kernel_regularizer=l2(reg_lambda)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(Y_train.shape[1], activation=output_activation))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model


# Create KerasClassifier wrapper
# (lets scikit-learn's GridSearchCV drive the Keras model returned by build_model)
model = KerasClassifier(build_fn=build_model)

# Define parameter grid
# Only epochs, batch_size and optimizer vary (2 x 3 x 2 = 12 combinations);
# the remaining entries hold a single value each.
param_grid = {
    'epochs': [30,50],
    'batch_size':[16,32,64],
    'optimizer': ['adam', 'rmsprop'],
    'reg_lambda':[0.001],
    'activation': ['relu'],
    'dropout_rate': [0.2],
    'units': [64]
}


# Perform GridSearchCV
# cv=3: every combination is trained and scored three times, so this cell is slow.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_search.fit(X_train_lstm, Y_train)


# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters: ", best_params)
print("Best Score: ", best_score)


In [None]:
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print("Best Parameters: ", best_params)
# print("Best Score: ", best_score)

# Hard-coded parameter set, so the cells below can run without repeating the grid search.
# NOTE(review): presumably the result of an earlier grid-search run — confirm.
best_params={'activation': 'relu', 'batch_size': 64, 'dropout_rate': 0.2, 'epochs': 50, 'optimizer': 'rmsprop', 'reg_lambda': 0.001, 'units': 64}

In [None]:
# Final LSTM model built from the selected hyper-parameters.
# Binary cross-entropy needs output values in [0, 1]. 'relu' is unbounded, so the
# configured activation is only used for the output layer when it is a probability
# activation; otherwise 'sigmoid' is used (the CNN section also pairs sigmoid with this loss).
output_activation = best_params['activation'] if best_params['activation'] in ('sigmoid', 'softmax') else 'sigmoid'

model = Sequential()
model.add(LSTM(best_params['units'], input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), kernel_regularizer=l2(best_params['reg_lambda'])))
model.add(Dropout(best_params['dropout_rate']))
model.add(Dense(Y_train.shape[1], activation=output_activation))
model.compile(loss='binary_crossentropy', optimizer=best_params['optimizer'], metrics=['accuracy'])

In [None]:
# train the final model with the batch size and epoch count from best_params
model.fit(X_train_lstm, Y_train, batch_size=best_params['batch_size'], epochs=best_params['epochs'])

In [None]:
#training data results
Y_pred_train=model.predict(X_train_lstm)
original_train,pred_train=convert(Y_pred_train,Y_train)
# eval_metrics expects (true labels, predicted labels); passing them the other way
# round swaps precision and recall.
eval_metrics(original_train,pred_train)


In [None]:
#test data results

Y_pred_test=model.predict(X_test_lstm)
original_test,pred_test=convert(Y_pred_test,Y_test)
# eval_metrics expects (true labels, predicted labels); passing them the other way
# round swaps precision and recall.
eval_metrics(original_test,pred_test)


**Pytorch Neural Network**

In [None]:
import torch
import torch.nn as nn

# Convert the features and the one-hot labels to float32 tensors.
# np.asarray accepts DataFrames and ndarrays (and CPU tensors when the cell is re-run).
X_train = torch.as_tensor(np.asarray(X_train, dtype=np.float32))
Y_train = torch.as_tensor(np.asarray(Y_train, dtype=np.float32))

# Layer sizes follow the data instead of being hard-coded (previously 969 and 20).
input_size = X_train.shape[1]
hidden_size = 250  # NOTE(review): defined but not used by the network below
num_layers = 1
output_size = Y_train.shape[1]
n_epoch = 1000
NN = nn.Sequential(
    nn.RNN(input_size, output_size, num_layers, batch_first=False))

# With batch_first=False the RNN expects (seq_len, batch, features). Each question is
# one independent sample, so it is fed as a sequence of length 1. Passing the 2-D
# matrix directly would treat the whole dataset as a single sequence.
rnn_input = X_train.unsqueeze(0)

optimizer = torch.optim.Adam(NN.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
for epoch in range(n_epoch):
    # Forward pass: nn.RNN returns (output, h_n); output has shape (1, batch, output_size)
    outputs = NN(rnn_input)
    logits = outputs[0].squeeze(0)
    loss = criterion(logits, Y_train)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        # {n_epoch} needs its braces, otherwise the literal text "n_epoch" is printed
        print(f'Epoch [{epoch+1}/{n_epoch}], Loss: {loss.item():.4f}')

In [None]:
# shape of the first element returned by the last forward pass
outputs[0].size()

In [None]:
# type of the object returned by the last forward pass
type(outputs)

In [None]:
# display the result of the last forward pass in full (debugging aid)
outputs

**CNN code (not optimized)**

In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import sklearn
from keras.layers import *
from keras.models import *
from keras.layers.convolutional import Conv1D
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()


# In[2]:


# NOTE(review): absolute, machine-specific paths — this section only runs where these files exist.
train_data = pd.read_csv(r'C:\Users\dishant\Downloads\train_data.csv')
test_data = pd.read_csv(r'C:\Users\dishant\Downloads\test_data.csv')


# In[3]:


# X: question texts; Y: one indicator column per distinct tag
X = train_data['Questions']
y = train_data['Tags']
Y = pd.get_dummies(y)
from sklearn.model_selection import train_test_split
# 80/20 split; no random_state is given, so the split differs between runs
X_train, X_test , y_train, y_test = train_test_split(X, Y , test_size = 0.20)


# In[4]:


# per-question list of whitespace-separated words (a Series of lists, not a word->id mapping)
word_index = X_train.str.split()


# In[5]:


from tensorflow.keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
vocab_size = 5000
oov_token = "<OOV>"
# the tokenizer is fitted on the training questions only
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)

# questions -> sequences of integer token ids
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# every sequence is padded/truncated at its end ("post") to exactly max_length ids
max_length = 300
padding_type = "post"
trunction_type="post"
X_train_padded = pad_sequences(X_train_sequences,maxlen=max_length, padding=padding_type,
                       truncating=trunction_type)
X_test_padded = pad_sequences(X_test_sequences,maxlen=max_length,
                               padding=padding_type, truncating=trunction_type)


# In[7]:


import os
import zipfile

# NOTE(review): absolute, machine-specific paths.
GLOVE_ZIP = r'C:\Users\dishant\Downloads\glove.42B.300d.zip'
GLOVE_DIR = r'C:\Users\dishant\Downloads\glove'
GLOVE_TXT = r'C:\Users\dishant\Downloads\glove\glove.42B.300d.txt'

# Unpack the archive only when the text file is not there yet, so re-running the
# notebook does not repeat the extraction.
if not os.path.exists(GLOVE_TXT):
    with zipfile.ZipFile(GLOVE_ZIP) as zip_ref:
        zip_ref.extractall(GLOVE_DIR)


# In[8]:


import numpy as np

# word -> embedding vector (float32 array), one entry per line of the GloVe file
embeddings_index = {}
# `with` closes the file even if a line fails to parse
with open(GLOVE_TXT,encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # skip blank lines
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


# In[9]:


# The Embedding layer looks rows up by the integer ids that `tokenizer` produces, so
# row i of the matrix must hold the vector of the word whose tokenizer id is i.
# (Previously the rows were filled in running-token order and the matrix had one row
# per token occurrence in the training text, so ids and rows did not correspond.)
EMBEDDING_DIM = 300   # length of the GloVe vectors loaded into embeddings_index
count = vocab_size    # number of rows: the tokenizer only emits ids below num_words


# In[10]:


embedding_matrix = np.zeros((count, EMBEDDING_DIM))
for word, token_id in tokenizer.word_index.items():
    if token_id >= count:
        continue  # ids at or above num_words never appear in the sequences
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[token_id] = embedding_vector




# In[11]:


# output_dim is the embedding size (EMBEDDING_DIM), not the sequence length
embedding_layer = Embedding(input_dim=count,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)


# In[33]:


# One output unit per label column of y_train, instead of a hard-coded 1927 that
# only matches one particular dataset.
num_classes = y_train.shape[1]

# frozen embeddings -> 1-D convolution -> global max pooling -> dense -> per-tag sigmoid
model = Sequential([
    embedding_layer,
    Conv1D(2000, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(2000, activation='relu'),
    Dense(num_classes, activation='sigmoid')
])


# In[34]:


model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])


# In[35]:


# The padded sequences hold integer token ids that the Embedding layer uses as row
# numbers, so they must not be rescaled. L2-normalising each row (as was done here)
# turns the ids into fractions in [0, 1], which no longer identify the words.
# The arrays are therefore only kept as explicit integer arrays.
X_train_padded = np.asarray(X_train_padded, dtype='int32')
X_test_padded = np.asarray(X_test_padded, dtype='int32')


# In[36]:


# (number of training questions, number of label columns)
y_train.shape


# In[37]:


# train for 20 epochs; the held-out split is used as validation data during training
history = model.fit(X_train_padded, y_train, epochs=20, validation_data=(X_test_padded, y_test))


# In[ ]:


# final evaluation on the same held-out split that served as validation data above
loss, accuracy = model.evaluate(X_test_padded,y_test)
print('Testing Accuracy is {} '.format(accuracy*100))


# In[ ]:

