
**Importing the required modules**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import random
from bs4 import BeautifulSoup
import time
import warnings; warnings.simplefilter('ignore')
import re
import string
from collections import Counter
# from google.colab import files

from tqdm import tqdm
import json


import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import gensim

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,jaccard_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn import model_selection, naive_bayes, svm
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler #fixed import
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


import tensorflow as tf
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.utils import to_categorical
from transformers import BertTokenizer, TFBertForSequenceClassification


# NLTK resources needed by the preprocessing cells below.
nltk.download('averaged_perceptron_tagger')  # POS tagger behind pos_tag() in lemmatization()
nltk.download('wordnet')  # lexical database used by WordNetLemmatizer
nltk.download('stopwords')  # English stop-word list
# Tokenizer models used by word_tokenize(); they were never downloaded before.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' - confirm for your version.
nltk.download('punkt')


**Combine the questions and tags tables**

In [None]:
###################################### Code for inner combine ######################################
df1 = pd.read_csv('Questions.csv', encoding='ISO-8859-1')
df2 = pd.read_csv('Tags.csv', encoding='ISO-8859-1')

# Combined question/tag frame: one row per (question, tag) pair.
# NOTE(review): DataFrame.join defaults to a left join, so questions without any
# tag are kept with Tag = NaN even though the banner above says "inner".
df3 = df1.set_index('Id').join(df2.set_index('Id'))
df3 = shuffle(df3)
df3 = df3.reset_index()


###################################### Code for preparing the train_data from the total data ######################################

# Only keep questions with a score greater than or equal to 1.
df4 = df3[df3["Score"] >= 1]

# Tag frequencies (counted once and reused) and question ids in first-seen order.
Tags = Counter(df4["Tag"])
unique_ids = Counter(df4["Id"])
keys = list(unique_ids.keys())

# Rank the tags by popularity: rank 1 is the most frequent tag.
q = sorted(zip(Tags.values(), Tags.keys()), reverse=True)
rank = {}
for j in range(len(q)):
    rank[q[j][1]] = j + 1

# Assign exactly one tag to every question.
# groupby(sort=False) visits the ids in first-seen order in a single pass,
# instead of filtering the whole frame once per question id.
# NOTE(review): the '>' comparison keeps the tag with the LARGEST rank number,
# i.e. the least popular tag of each question - confirm that this is intended.
Final_dataframe = {"Body": [], "Title": [], "Tags": []}
for _, current_df in df4.groupby("Id", sort=False):
    selected_tag = -1
    selected_tag_rank = -1
    for tag in current_df["Tag"]:
        if rank[tag] > selected_tag_rank:
            selected_tag = tag
            selected_tag_rank = rank[tag]
    # Body and Title are identical on every row of one question; take the first.
    Final_dataframe["Body"].append(current_df["Body"].iloc[0])
    Final_dataframe["Title"].append(current_df["Title"].iloc[0])
    Final_dataframe["Tags"].append(selected_tag)

df5 = pd.DataFrame(Final_dataframe)

# Concatenate the title and the body into the Questions column, then drop the sources.
df5["Questions"] = df5["Title"] + " " + df5["Body"]
df5 = df5.drop(columns=["Body", "Title"])




In [None]:
# Use the frame built from Questions.csv / Tags.csv as the working dataset.
train_data=df5

In [None]:
# Load the working dataset from New_data.csv; this rebinds train_data and so
# replaces whatever an earlier cell assigned to it.
train_data=pd.read_csv("New_data.csv")


In [None]:
# Preview the first rows of the working dataset.
train_data.head()

**Machine learning part (Preprocessing and exploration)**

---



In [None]:
# Discard every row without a tag, then renumber the remaining rows from 0.
train_data = train_data.dropna(subset=['Tags']).reset_index(drop=True)



In [None]:
# Data cleaning of the raw question text.

# Literal markup fragments that are replaced by a space. regex=False makes the
# literal matching explicit instead of relying on the pandas default, which
# differs between pandas versions.
substrings_to_replace = ['</p>', '<p>', '\n', '<pre>', '</pre>', '<a href=" ">']
for substring in substrings_to_replace:
    train_data['Questions'] = train_data['Questions'].str.replace(substring, ' ', regex=False)

# Patterns are compiled once. Raw strings keep the patterns unchanged while
# avoiding the invalid-escape warnings caused by "\/", "\d", "\." and "\w".
code_block_pattern = re.compile(r'<code>.*?</code>', flags=re.DOTALL)
url_pattern = re.compile(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)')

# removing <code>...</code> sections (DOTALL lets a section span several lines)
train_data['Questions'] = train_data['Questions'].apply(lambda x: code_block_pattern.sub(' ', x))

# removing any urls
train_data['Questions'] = train_data['Questions'].apply(lambda x: url_pattern.sub(' ', x))

# keeping only the words longer than 3 characters
train_data['Questions'] = train_data['Questions'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

# converting to lowercase
train_data['Questions'] = train_data['Questions'].apply(lambda x: x.lower())


In [None]:
# Strip the English stop words first, then delete every punctuation character
# from each remaining token of the questions.

punctuations = string.punctuation
stop_words = set(nltk.corpus.stopwords.words('english'))

# Translation table that deletes all punctuation characters from a token.
_drop_punctuation = str.maketrans('', '', punctuations)

train_data['Questions'] = train_data['Questions'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in stop_words)
)
train_data['Questions'] = train_data['Questions'].apply(
    lambda text: ' '.join(token.translate(_drop_punctuation) for token in text.split())
)


In [None]:
# Drop the rows whose question became empty after filtering.
# A single boolean mask replaces the row-by-row in-place drop, and questions
# that consist of whitespace only (left behind when every token was pure
# punctuation) are treated as empty too.
has_text = train_data['Questions'].str.strip().str.len() > 0
train_data = train_data[has_text].reset_index(drop=True)

In [None]:
#using lemmatization on the questions of the train and test dataset
def lemmatization(text):
    """Lemmatize a sequence of tokens using their part-of-speech tags.

    ``text`` is handed to ``pos_tag``. Every (word, tag) pair is lemmatized with
    the WordNet part of speech selected by the upper-cased first letter of its
    tag (N -> noun, V -> verb, R -> adverb, J -> adjective); any other tag
    falls back to noun. Returns the list of lemmas in input order.
    """
    wordnet_pos_by_initial = {'N': 'n', 'V': 'v', 'R': 'r', 'J': 'a'}
    tagged_words = pos_tag(text)
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word, pos=wordnet_pos_by_initial.get(tag[0].upper(), 'n'))
        for word, tag in tagged_words
    ]

# Lemmatize every question token by token and store it back as one
# space-joined string.
train_data['Questions'] = train_data['Questions'].apply(
    lambda question: " ".join(lemmatization(question.split()))
)



In [None]:
#Analysing certain parameters about the train data

def tokenize_question(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

# Corpus statistics: total number of tokens and size of the vocabulary.
questions = train_data['Questions'].tolist()

total_words = sum(len(text.split()) for text in questions)
print('The total number of words in the data is: ', total_words)

question_vect = CountVectorizer(tokenizer=tokenize_question)
# `questions` is rebound here from the list of strings to the document-term matrix.
questions = question_vect.fit_transform(questions)

vocabulary_size = len(question_vect.vocabulary_)
print('The number of words in the vocabulary is: ', vocabulary_size)





In [None]:
# (rows, columns) of the preprocessed dataset.
train_data.shape

In [None]:
# Save the preprocessed frame to disk without the index column.
# The active line writes the file that a later cell reads back as the testing
# dataset; the commented-out line writes the file name that the same later
# cell reads back as the training dataset.

# train_data.to_csv("Data_570000_preprocessed.csv",index=False)
train_data.to_csv("Data_preprocessed.csv",index=False)


In [None]:
# Reload the preprocessed data from disk.
# NOTE(review): "Data_570000_preprocessed.csv" is only written by a commented-out
# line in this notebook, so it must already exist on disk for this cell to run.
train_data=pd.read_csv("Data_570000_preprocessed.csv") #training dataset
test_data=pd.read_csv("Data_preprocessed.csv") #testing dataset



In [None]:
# Preview the first rows of the reloaded training dataset.
train_data.head()

In [None]:
# Explore the Tags column and select the most frequent tags.

# Number of top tags to keep.
tags_top = 5

tags = train_data['Tags'].tolist()
tags_Freq = Counter(tags)

print("Total number of unique tags : ", len(tags_Freq))

# (tag, count) pairs, most frequent first; equal counts keep first-seen order.
tags2 = tags_Freq.most_common()
total_frequency = sum(tags_Freq.values())

# The tags_top most frequent tags and their frequencies.
top_tags = [tags2[j][0] for j in range(tags_top)]
top_tags_values = [tags2[j][1] for j in range(tags_top)]


# Optional bar chart of the selected tags (left disabled):
# plt.bar(top_tags, top_tags_values, width=0.5, color='r')
# plt.xlabel('Tags')
# plt.ylabel('Frequencies')
# plt.title('Top Tags and Frequencies')
# plt.xticks(rotation=90)  # Rotate the x-axis labels for better visibility
# plt.tight_layout()  # Adjust the layout to prevent label cutoff
# plt.show()


In [None]:
# Frequency of the least frequent of the selected top tags.
# (`top_300_tags` was never defined in this notebook; the selected tags live in `top_tags`.)
n_each=tags_Freq[top_tags[-1]]


In [None]:
# Build a class-balanced training set: draw the same number of questions from
# every selected top tag, namely the frequency of the least frequent one.
# (`top_300_tags` was never defined in this notebook; the selected tags live in `top_tags`.)
n_each = tags_Freq[top_tags[-1]]
sampled_frames = [
    train_data[train_data["Tags"] == tag].sample(n=n_each, random_state=42)
    for tag in top_tags
]
# DataFrame.append was removed in pandas 2.0; one concat over the collected
# samples also avoids re-copying the growing frame on every iteration.
dataframe = pd.concat(sampled_frames, ignore_index=True)
train_data = shuffle(dataframe).reset_index(drop=True)

In [None]:
#performing the encoding of the tags column

def filter_data_by_most_common_tags(data, common_tags):
    """Return the rows of ``data`` whose "Tags" value is one of ``common_tags``.

    The original row labels are kept; the caller resets the index if needed.
    """
    keep_row = data["Tags"].isin(common_tags)
    return data[keep_row]


def one_hot(column,data):
    """One-hot encode ``column`` of ``data`` and return the extended frame.

    Parameters
    ----------
    column : str
        Name of the categorical column to encode.
    data : pandas.DataFrame
        Frame containing ``column``; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``data`` followed by one ``tag_<value>`` indicator column per distinct
        value of ``column``, with ``column`` itself removed.
    """
    # Perform one-hot encoding using get_dummies()
    one_hot_encoded = pd.get_dummies(data[column], prefix="tag")

    # Concatenate the one-hot encoded columns with the original dataframe
    data_extended = pd.concat([data, one_hot_encoded], axis=1)

    # Remove the encoded source column. This used to be hard-coded to 'Tags',
    # which failed or dropped the wrong column for any other `column`.
    return data_extended.drop(columns=[column])

def label_encoding(data, most_common_tags):
    """Replace each "Tags" value of ``data`` by its position in ``most_common_tags``.

    Tags that are not listed are encoded as -1. The "Tags" column of ``data``
    is overwritten in place and the same frame object is returned.
    """
    code_of = {tag: position for position, tag in enumerate(most_common_tags)}
    data["Tags"] = data["Tags"].apply(lambda tag: code_of.get(tag, -1))
    return data


# Keep only the questions that carry one of the selected top tags, one-hot
# encode the tag column and renumber the rows - for the training and the
# test set alike.
Final_train = one_hot("Tags", filter_data_by_most_common_tags(train_data, top_tags)).reset_index(drop=True)

Final_test = one_hot("Tags", filter_data_by_most_common_tags(test_data, top_tags)).reset_index(drop=True)

# Alternative approach (integer labels, used with the tfidf vectorizer):
# Final_train=label_encoding(Final_train,most_common_tags)
# Final_test=label_encoding(Final_test,most_common_tags)



In [None]:
# Add a 'Text_Tokenized' column (the lower-cased question split by
# word_tokenize) to the raw and to the filtered train/test frames.
for frame in (train_data, Final_train, test_data, Final_test):
    frame['Text_Tokenized'] = frame['Questions'].str.lower().apply(word_tokenize)



In [None]:
# Train a Word2Vec model on the tokenized training questions.

vector_size_n_w2v = 100  # length of every learned word vector

w2v_settings = dict(
    vector_size=vector_size_n_w2v,
    window=3,
    min_count=1,
    sg=0,  # 0=CBOW, 1=Skip-gram
    epochs=5,
)
w2v_model = Word2Vec(train_data['Text_Tokenized'], **w2v_settings)


In [None]:
#saving the word2vec model

# w2v_model.save("word2vec_model")

# pk.dump(vector_size_n_w2v, open('vector_size_w2v_metric.pkl', 'wb'))


In [None]:

#check which words are similar to another given word
# w2v_model.wv.most_similar(word)

In [None]:
# Use the pretrained Google News vectors instead of the model trained above:
# this rebinds w2v_model to the object loaded from the binary file.
# NOTE(review): load_word2vec_format is called on KeyedVectors, so the result is
# presumably a KeyedVectors object rather than a full Word2Vec model - code that
# accesses `w2v_model.wv` or assumes 100-dimensional vectors should be checked.

w2v_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary = True) 


In [None]:
# Turn every tokenized training question into one vector: the element-wise
# mean of the vectors of its in-vocabulary words.

# Works with a trained Word2Vec model (vectors live in `.wv`) as well as with
# plain KeyedVectors such as the pretrained Google News vectors, which have no
# `.wv` attribute.
word_vectors = getattr(w2v_model, "wv", w2v_model)
# Take the dimensionality from the vectors actually in use, so the all-zero
# fallback has the same length as the averaged vectors (100 for the model
# trained in this notebook, 300 for the Google News vectors).
embedding_size = word_vectors.vector_size

text_vect_avg = []
for idx, text in enumerate(Final_train["Text_Tokenized"]):
    current = np.array([word_vectors[i] for i in text if i in word_vectors])
    if (idx % 100 == 0): print(idx)
    if current.size:
        text_vect_avg.append(current.mean(axis=0))
    else:
        # no known word in this question: fall back to an all-zero vector
        text_vect_avg.append(np.zeros(embedding_size, dtype=float))

df_Machine_Learning = pd.DataFrame(text_vect_avg)

# Naming the columns of the word2vec dataset
df_Machine_Learning.columns = ['Element_' + str(i+1) for i in range(0, df_Machine_Learning.shape[1])]

# Concatenating the word2vec features and the tag columns.
# NOTE(review): iloc[:, 1:-1] assumes Final_train is laid out as
# [Questions, <one-hot tag columns>, Text_Tokenized] at this point, and running
# this cell twice changes that layout - confirm before re-running.
Final_train = pd.concat([df_Machine_Learning, Final_train.iloc[:, 1:-1]], axis=1, sort=False)

model_type = "Word2vec"



In [None]:
# Turn every tokenized test question into one vector: the element-wise mean of
# the vectors of its in-vocabulary words.

# Works with a trained Word2Vec model (vectors live in `.wv`) as well as with
# plain KeyedVectors such as the pretrained Google News vectors, which have no
# `.wv` attribute.
word_vectors = getattr(w2v_model, "wv", w2v_model)
# Take the dimensionality from the vectors actually in use, so the all-zero
# fallback has the same length as the averaged vectors.
embedding_size = word_vectors.vector_size

text_vect_avg = []
for idx, text in enumerate(Final_test["Text_Tokenized"]):
    current = np.array([word_vectors[i] for i in text if i in word_vectors])
    if (idx % 100 == 0): print(idx)
    if current.size:
        text_vect_avg.append(current.mean(axis=0))
    else:
        # no known word in this question: fall back to an all-zero vector
        text_vect_avg.append(np.zeros(embedding_size, dtype=float))

df_Machine_Learning = pd.DataFrame(text_vect_avg)

# Naming the columns of the word2vec dataset
df_Machine_Learning.columns = ['Element_' + str(i+1) for i in range(0, df_Machine_Learning.shape[1])]

# Concatenating the word2vec features and the tag columns.
# NOTE(review): iloc[:, 1:-1] assumes Final_test is laid out as
# [Questions, <one-hot tag columns>, Text_Tokenized] at this point, and running
# this cell twice changes that layout - confirm before re-running.
Final_test = pd.concat([df_Machine_Learning, Final_test.iloc[:, 1:-1]], axis=1, sort=False)

model_type = "Word2vec"


In [None]:
# Loading the saved word2vec model and the vector size stored next to it.
import pickle as pk  # `pk` was used below without ever being imported

w2v_model_reloaded = Word2Vec.load("word2vec/word2vec_model")
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
# The context manager closes the file, which the bare open() call never did.
with open("word2vec/vector_size_w2v_metric.pkl", 'rb') as vector_size_file:
    vector_size_n_reloaded = pk.load(vector_size_file)

In [None]:
#tdidf approach

def tokenize_question(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

def filter_number_features(name):
    """Tell whether the feature ``name`` should be kept.

    A name is rejected (False) when it starts with an ASCII digit or when it
    has 3 characters or fewer; every other name is kept (True).
    """
    starts_with_digit = name[0] in '0123456789'
    return not (starts_with_digit or len(name) <= 3)
# TF-IDF vectorizer over whitespace tokens, limited to 1000 features.
tfidf_vect = TfidfVectorizer(tokenizer=tokenize_question,
                               stop_words='english',
                               min_df=4,
                               max_df=0.5,max_features=1000)

# toarray() yields a plain ndarray; todense() returned the discouraged
# numpy.matrix type.
X_train_tfidf = tfidf_vect.fit_transform(Final_train["Questions"]).toarray()
# print('The number of words in the vocabulary is: ', len(tfidf_vect.vocabulary_))

# Feature names (fetched once and reused below) and their IDF scores.
feature_names = tfidf_vect.get_feature_names_out()
idf_scores = tfidf_vect.idf_

# Keep the features accepted by filter_number_features, ordered by decreasing IDF.
Final_Feature_Set = [
    [feature_name, idf_scores[idx]]
    for idx, feature_name in enumerate(feature_names)
    if filter_number_features(feature_name)
]
Final_Feature_Set = sorted(Final_Feature_Set, key=lambda x: x[1], reverse=True)
Final_Feature_Set = [x[0] for x in Final_Feature_Set]

df_train = pd.DataFrame(X_train_tfidf, columns=feature_names)
df_train = df_train[Final_Feature_Set]

# Concatenating the tfidf features with every Final_train column but the first.
# NOTE(review): presumably the first column is the raw Questions text and the
# rest are the tag columns - confirm the layout of Final_train at this point.
Final_train = pd.concat([df_train, Final_train.iloc[:, 1:]], axis=1, sort=False)

model_type = "Tdidf"

In [None]:
# Integer-encode the questions for a Keras Embedding layer.

# Fit the tokenizer on the questions and map every question to a list of ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(Final_train["Questions"])
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(Final_train["Questions"])

# Pad every sequence to the length of the longest one.
max_sequence_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)



In [None]:
# The last len(top_tags) columns of Final_train are the targets; every column
# before them belongs to the feature matrix.
n_tag_columns = len(top_tags)

X_train = Final_train.iloc[:, :-n_tag_columns]
Y_train = Final_train.iloc[:, -n_tag_columns:]

# Alternative for a single label column:
# X_train=Final_train.iloc[:,:-1]
# Y_train=Final_train.iloc[:,-1:]


In [None]:
# Alternative split for the Keras embedding path (features = padded token ids):
# X_train, X_test, Y_train, Y_test = train_test_split(padded_sequences, Y_train, test_size=0.2, random_state=42)

# Hold out 20% of the rows as X_test / Y_test with a fixed random_state.
# X_train and Y_train are rebound to the remaining part, so running this cell a
# second time splits the already reduced training part again.
X_train, X_test, Y_train, Y_test = train_test_split(X_train,Y_train, test_size=0.2, random_state=42)



# Optional conversion of the one-hot targets into class indices:
# Y_train=np.array([np.argmax(Y_train.iloc[idx,:]) for idx in range(Y_train.shape[0])])
# Y_test=np.array([np.argmax(Y_test.iloc[idx,:]) for idx in range(Y_test.shape[0])])

In [None]:
# Same column split for the separate test set: the last len(top_tags) columns
# are the targets, the columns before them are the features.
n_tag_columns = len(top_tags)
test_x = Final_test.iloc[:, :-n_tag_columns]
test_y = Final_test.iloc[:, -n_tag_columns:]

In [None]:
# Using random oversampling to deal with class imbalance.


def _class_distribution(y):
    """Return (labels, counts) for targets given as class labels or one-hot rows.

    For a 2-D target with more than one column the class of a row is the
    position of its largest entry; calling np.unique on such a matrix directly
    would only count its 0 and 1 cells.
    """
    y_array = np.asarray(y)
    if y_array.ndim == 2 and y_array.shape[1] > 1:
        y_array = y_array.argmax(axis=1)
    return np.unique(y_array, return_counts=True)


# Print the class distribution before oversampling
print("Class distribution before oversampling:")
unique, counts = _class_distribution(Y_train)
for label, count in zip(unique, counts):
    print(f"Class {label}: {count} samples")

# Apply Random Oversampling
# (this heading used to lack the leading '#', which made the cell a SyntaxError)
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, Y_train_resampled = oversampler.fit_resample(X_train, Y_train)

# Print the class distribution after oversampling
print("\nClass distribution after oversampling:")
unique, counts = _class_distribution(Y_train_resampled)
for label, count in zip(unique, counts):
    print(f"Class {label}: {count} samples")

In [None]:


# Storing the results for models.
vectorization = ["Tdidf", "Word2vec"]

Models_tdidf = ["Logistic", "Svm", "Naivebayes", "Ann", "Gradientboosting"]
Models_word2vec = ["Logistic", "Svm", "Rnn", "Gradientboosting"]
Tags = ['5', '20', '100']

# Results[vectorization][model][number of top tags] holds one score per
# combination; every entry starts at -1.
Results = {
    method: {
        model: {Tag: -1 for Tag in Tags}
        for model in (Models_tdidf if method == "Tdidf" else Models_word2vec)
    }
    for method in vectorization
}
        
       
            

    

In [None]:
#function to calculate the evaluation metrics

def eval_metrics(y_test, y_predicted, model,test,print_metrics=True):
    """Compute accuracy and weighted precision, recall and F1 of predictions.

    Parameters
    ----------
    y_test : true labels.
    y_predicted : predicted labels.
    model : str
        Model key inside ``Results[model_type]``; only used when ``test`` is truthy.
    test : truthy to store the accuracy (in percent, rounded to 2 decimals) in
        the global ``Results`` under the current ``model_type``, ``model`` and
        the number of selected top tags.
    print_metrics : bool
        Print the four metrics when True.

    Returns
    -------
    tuple
        ``(f1, precision, recall, accuracy)``.
    """
    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, average='weighted')
    recall = recall_score(y_test, y_predicted, average='weighted')
    f1 = f1_score(y_test, y_predicted, average='weighted')
    if test:
        # The selected tags live in `top_tags`; the previously referenced
        # `top_300_tags` was never defined and raised a NameError here.
        Results[model_type][model][str(len(top_tags))] = round(accuracy*100, 2)

    if print_metrics:
        print("f1: %.3f - precision: %.3f - recall: %.3f - accuracy: %.3f" % (
            f1, precision, recall, accuracy))
    return f1, precision, recall, accuracy

def print_score(y_pred,y_test):
    """Print the weighted Jaccard score of ``y_pred`` against ``y_test``."""
    # `j_score` was never defined in this notebook; the imported scikit-learn
    # jaccard_score is used instead, with 'weighted' averaging like the other
    # metrics computed in this notebook.
    # NOTE(review): confirm that 'weighted' matches the intended averaging.
    print('Jacard score: {}'.format(jaccard_score(y_test, y_pred, average='weighted')))
    print('----')
    
def convert(pred,original,flag):
    """Convert score / one-hot matrices into lists of class indices.

    Parameters
    ----------
    pred : 2-D array-like
        One row of scores per sample; the predicted class is the column of the
        largest score.
    original : 2-D array-like or pandas.DataFrame
        One-hot (or score) rows of the true labels.
    flag : str
        Kept for backward compatibility. It used to select DataFrame ("tdidf")
        versus ndarray indexing; both input kinds are now handled for any value.

    Returns
    -------
    tuple
        ``(original, pred)`` - two lists of class indices, in that order.
    """
    # One vectorised argmax per matrix instead of a Python-level loop over rows.
    original = np.asarray(original).argmax(axis=1).tolist()
    pred = np.asarray(pred).argmax(axis=1).tolist()
    return original, pred

In [None]:
# Write the Results dictionary to Results.json.
with open('Results.json', 'w') as f:
    json.dump(Results, f)     
    
    

In [None]:
# Load the saved results back as a Python dictionary; the context manager
# closes the file, which the bare open() call never did.
with open('Results.json') as f:
    Results = json.load(f)




In [None]:
# Results['Tdidf']['Logistic']['5']=81.48
# Results['Tdidf']['Logistic']['20']=44.65
# Results['Tdidf']['Logistic']['100']=29.31
# Results['Tdidf']['Svm']['5']=84.42
# Results['Tdidf']['Svm']['20']=48.61
# Results['Tdidf']['Svm']['100']=31.52
# Results['Tdidf']['Naivebayes']['5']=72.97
# Results['Tdidf']['Naivebayes']['20']=18.98
# Results['Tdidf']['Naivebayes']['100']=6.39
# Results['Word2vec']['Logistic']['5']=79.47
# Results['Word2vec']['Logistic']['20']=35.13
# Results['Word2vec']['Logistic']['100']=17.29
# Results['Word2vec']['Svm']['5']=90.23
# Results['Word2vec']['Svm']['20']=42.81
# Results['Word2vec']['Svm']['100']=20.48
# Results['Word2vec']['Rnn']['5']=88.43
# Results['Word2vec']['Rnn']['20']=51.58
# Results['Word2vec']['Rnn']['100']=21.65


In [None]:
# Preview the first rows of the training targets.
Y_train.head()

**Machine learning using classification algorithms**

In [None]:
%%time
# Classifier - Algorithm - Logistic Regression (gridCv)
# Grid search over C and the solver of a one-vs-rest logistic regression:
# 3 x 2 = 6 parameter combinations, each evaluated with 5-fold cross-validation.

log_clf = OneVsRestClassifier(LogisticRegression())
# The 'estimator__' prefix addresses the LogisticRegression wrapped by OneVsRestClassifier.
param_grid = {
    'estimator__C': [0.1, 1, 10],  # Regularization parameter C
    'estimator__solver': ['liblinear', 'lbfgs']  # Solver algorithm
}

# grid search
grid_search = GridSearchCV(log_clf, param_grid, cv=5)
grid_search.fit(X_train, Y_train)

# The best parameters
best_params = grid_search.best_params_
print(best_params)




In [None]:
# Classifier - Algorithm - Logistic Regression 
# Fit a one-vs-rest logistic regression with fixed hyper-parameters and report
# the metrics on the training part and on the held-out part of the split.

log_clf = OneVsRestClassifier(LogisticRegression(C=10, solver='liblinear',verbose=True))

log_clf.fit(X_train, Y_train)

Y_train_predict = log_clf.predict(X_train)



Y_test_predict=log_clf.predict(X_test)

print("LogisticR train Score :")

# test flag 0: the metrics are only printed, nothing is written to Results
eval_metrics(Y_train,Y_train_predict,"Logistic",0)

print("LogisticR test Score : ")

# test flag 1: the accuracy is additionally stored in Results
eval_metrics(Y_test,Y_test_predict,"Logistic",1)



In [None]:
# Classifier - Algorithm - Gradient Boosting (gridCv)
# Grid search for a one-vs-rest XGBoost classifier: 1 x 3 x 3 x 2 = 18
# parameter combinations, each evaluated with 5-fold cross-validation.


#the parameter grid
# The 'estimator__' prefix addresses the XGBClassifier wrapped by OneVsRestClassifier.
param_grid = {
    'estimator__n_estimators': [100],
    'estimator__learning_rate': [0.1, 0.01, 0.001],
    'estimator__max_depth': [3, 5, 7],
    'estimator__subsample': [0.8, 1.0],
    
}

#  Gradient Boosting classifier
GB = OneVsRestClassifier(XGBClassifier())


# Grid search
grid_search = GridSearchCV(GB, param_grid, cv=5,verbose=2)
grid_search.fit(X_train, Y_train)

# the best parameters
best_params = grid_search.best_params_
print(best_params)

In [None]:
# Classifier - Algorithm - Gradient Boosting
# Fit a one-vs-rest XGBoost classifier with fixed hyper-parameters and report
# the metrics on the training part and on the held-out part of the split.

GB = OneVsRestClassifier(XGBClassifier(learning_rate= 0.1,max_depth=7,n_estimators=300,subsample=0.8))

GB.fit(X_train, Y_train)

Y_train_predict = GB.predict(X_train)

Y_test_predict=GB.predict(X_test)

print("GradientB train Score :")

# The model key is spelled "Gradientboosting" everywhere in Results; the train
# call used "GradientBoosting", which would raise a KeyError as soon as its
# test flag is switched on.
eval_metrics(Y_train,Y_train_predict,"Gradientboosting",0)

print("GradientB test Score : ")

# test flag 1: the accuracy is additionally stored in Results
eval_metrics(Y_test,Y_test_predict,"Gradientboosting",1)


In [None]:
# Classifier - Algorithm - SVM (grid_cv)
# Grid search for a one-vs-rest SVC: 3 x 4 x 3 x 2 = 72 parameter
# combinations, each evaluated with 5-fold cross-validation.

# Define the parameter grid
# The 'estimator__' prefix addresses the SVC wrapped by OneVsRestClassifier.
param_grid = {
    'estimator__C': [0.1, 1, 10],
    'estimator__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'estimator__degree': [2, 3, 4],
    'estimator__gamma': ['scale', 'auto']
}

SVM = OneVsRestClassifier(svm.SVC())

# grid search
grid_search = GridSearchCV(SVM, param_grid, cv=5)
grid_search.fit(X_train, Y_train)  

#the best parameters
best_params = grid_search.best_params_
print(best_params)

In [None]:
# Classifier - Algorithm - SVM
# Fit a one-vs-rest SVC with fixed hyper-parameters and report its metrics.

# Train SVM model
SVM = OneVsRestClassifier(svm.SVC(C=10, degree=2, gamma='scale', kernel='rbf',verbose=True))
SVM.fit(X_train,Y_train)

# Predict the labels and reduce the one-hot rows to class indices.
# convert() returns (original, pred) in that order; the results used to be
# unpacked the other way round, so eval_metrics received the predictions as
# true labels, which distorts precision and recall (accuracy is unaffected).
# NOTE(review): the test score here uses the separate test set (test_x/test_y),
# whereas the logistic / gradient-boosting cells use X_test/Y_test - confirm.
original_train, predictions_SVM_train = convert(SVM.predict(X_train),Y_train,"tdidf")
original_test, predictions_SVM_test = convert(SVM.predict(test_x),test_y,"tdidf")

print("SVM train Score :")

eval_metrics(original_train,predictions_SVM_train,"Svm",0)

print("SVM test Accuracy Score : ")

eval_metrics(original_test,predictions_SVM_test,"Svm",1)

In [None]:
# Classifier - Algorithm - Naive Bayes (gridCv)
# Grid search for a one-vs-rest multinomial naive Bayes: 3 x 2 = 6 parameter
# combinations, each evaluated with 5-fold cross-validation.


# The 'estimator__' prefix addresses the MultinomialNB wrapped by OneVsRestClassifier.
param_grid = {
    'estimator__alpha': [0.1, 0.5, 1.0],  # Smoothing parameter
    'estimator__fit_prior': [True, False]  # Whether to learn class prior probabilities
}

# Create the Naive Bayes classifier
naive = OneVsRestClassifier(naive_bayes.MultinomialNB())

# Perform grid search
grid_search = GridSearchCV(naive, param_grid, cv=5)
grid_search.fit(X_train, Y_train)

# Print the best parameters
print("Best parameters:", grid_search.best_params_)






In [None]:
# Classifier - Algorithm - Naive Bayes 
# Fit a one-vs-rest multinomial naive Bayes with fixed hyper-parameters and
# report the metrics on the training part and on the held-out part of the split.


Naive = OneVsRestClassifier(naive_bayes.MultinomialNB(alpha=0.5,fit_prior=True))

Naive.fit(X_train,Y_train)
# predict the labels on the held-out part and on the training part

predictions_NB_test = Naive.predict(X_test)
predictions_NB_train = Naive.predict(X_train)

print("Naivebayes train Score :")

# test flag 0: the metrics are only printed, nothing is written to Results
eval_metrics(Y_train,predictions_NB_train,"Naivebayes",0)

print("Naivebayes test Score : ")

# test flag 1: the accuracy is additionally stored in Results
eval_metrics(Y_test,predictions_NB_test,"Naivebayes",1)

**Neural Network part (ANN)**

In [None]:
# Finding the best parameter set for the ANN model

def build_model(units=64, dropout_rate=0.2,reg_lambda=0.001, activation1='relu',activation2='relu',optimizer='adam'):
    """Build and compile a dense network with one hidden layer.

    The input width is taken from the global ``X_train`` and the output width
    from the global ``Y_train``. The hidden Dense layer (``units`` neurons,
    ``activation1``, L2 kernel regularisation ``reg_lambda``) is followed by
    Dropout(``dropout_rate``) and an output Dense layer using ``activation2``.
    The model is compiled with binary cross-entropy and the accuracy metric.
    """
    network = Sequential([
        Dense(units, activation=activation1, input_shape=(X_train.shape[1],), kernel_regularizer=l2(reg_lambda)),
        Dropout(dropout_rate),
        Dense(Y_train.shape[1], activation=activation2),
    ])
    network.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return network


# KerasClassifier wrapper: lets GridSearchCV drive the Keras model returned by build_model
model = KerasClassifier(build_fn=build_model)

# parameter grid: 2 x 2 x 2 x 3 x 2 x 2 x 2 x 2 = 384 combinations
param_grid = {
    'epochs': [20,30], 
    'batch_size': [32, 64],  
    'optimizer': ['rmsprop', 'adam'], 
    'reg_lambda': [0.001, 0.01, 0.1], 
    'activation1': ['relu', 'sigmoid'],  
    'activation2': ['relu', 'sigmoid'], 
    'dropout_rate': [0.2, 0.4],  
    'units': [32, 64]  
}


# GridSearchCV with 3-fold cross-validation (384 x 3 = 1152 model fits)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3,verbose=2)
grid_search.fit(X_train, Y_train)


# best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters: ", best_params)
print("Best Score: ", best_score)


In [None]:
# Hand-set hyper-parameters for the final ANN; this overrides whatever a grid
# search stored in best_params. Note that 'epochs': 50 is not one of the values
# (20, 30) searched in the ANN grid.
best_params={'activation1': 'relu','activation2': 'sigmoid', 'batch_size': 32, 'dropout_rate': 0.2, 'epochs': 50, 'optimizer': 'adam', 'reg_lambda': 0.001, 'units': 64}

In [None]:
# Final ANN assembled from the chosen hyper-parameters: a regularised hidden
# Dense layer, dropout, and one output unit per target column.
model = Sequential([
    Dense(best_params['units'], activation=best_params['activation1'], input_shape=(X_train.shape[1],), kernel_regularizer=l2(best_params['reg_lambda'])),
    Dropout(best_params['dropout_rate']),
    Dense(Y_train.shape[1], activation=best_params['activation2']),
])
model.compile(loss='binary_crossentropy', optimizer=best_params['optimizer'], metrics=['accuracy'])

In [None]:
# Train the final ANN.
# NOTE(review): epochs is hard-coded to 100 here and does not use
# best_params['epochs'] (50), unlike the batch size - confirm which is intended.
model.fit(X_train, Y_train, batch_size=best_params['batch_size'], epochs=100)

In [None]:
#training data results

Y_pred_train=model.predict(X_train)
original_train,pred_train=convert(Y_pred_train,Y_train,"tdidf")
# eval_metrics expects (true labels, predicted labels). The two arguments used
# to be passed the other way round, which distorts the reported precision and
# recall (accuracy is unaffected).
eval_metrics(original_train,pred_train,"Ann",0)


In [None]:
#test data results

Y_pred_test=model.predict(X_test)
original_test,pred_test=convert(Y_pred_test,Y_test,"tdidf")
# eval_metrics expects (true labels, predicted labels). The two arguments used
# to be passed the other way round, which distorts the reported precision and
# recall (accuracy is unaffected).
eval_metrics(original_test,pred_test,"Ann",1)

**Neural Network part (RNN)**

In [None]:

# Cast the feature matrices to float32 numpy arrays.
X_train = np.array(X_train).astype(np.float32)
X_test = np.array(X_test).astype(np.float32)
test_x = np.array(test_x).astype(np.float32)

# Insert a middle axis of length 1 so that every 2-D matrix (rows, columns)
# becomes a 3-D array (rows, 1, columns) for the LSTM input.
X_train_lstm = X_train[:, np.newaxis, :]
X_test_lstm = X_test[:, np.newaxis, :]

x_test_lstm = test_x[:, np.newaxis, :]

In [None]:
#Finding the best parameter set for Rnn model

def build_model(units=64, dropout_rate=0.2,reg_lambda=0.001, activation='relu',optimizer='adam'):
    """Build and compile a single-layer LSTM classifier.

    The input shape is taken from the global ``X_train_lstm`` (its second and
    third dimensions) and the output width from the global ``Y_train``. The
    LSTM layer (``units`` cells, L2 kernel regularisation ``reg_lambda``) is
    followed by Dropout(``dropout_rate``) and an output Dense layer using
    ``activation``. The model is compiled with binary cross-entropy and the
    accuracy metric.
    """
    network = Sequential([
        LSTM(units, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), kernel_regularizer=l2(reg_lambda)),
        Dropout(dropout_rate),
        Dense(Y_train.shape[1], activation=activation),
    ])
    network.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return network


# KerasClassifier wrapper: lets GridSearchCV drive the Keras model returned by build_model
model = KerasClassifier(build_fn=build_model)

# parameter grid: only the number of epochs varies (2 combinations)
param_grid = {
    'epochs': [50,100],
    'batch_size':[64],
    'optimizer': ['rmsprop'],
    'reg_lambda':[0.001],
    'activation': ['relu'],
    'dropout_rate': [0.2],
    'units': [64]
}


# GridSearchCV with 3-fold cross-validation (2 x 3 = 6 model fits)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_search.fit(X_train_lstm, Y_train)


# best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters: ", best_params)
print("Best Score: ", best_score)


In [None]:
# Reading the parameters from the grid search is left disabled:
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print("Best Parameters: ", best_params)
# print("Best Score: ", best_score)

# Hand-set hyper-parameters for the final LSTM model; this overrides whatever a
# grid search stored in best_params.
best_params={'activation': 'relu', 'batch_size': 64, 'dropout_rate': 0.2, 'epochs': 50, 'optimizer': 'rmsprop', 'reg_lambda': 0.001, 'units': 64}

In [None]:
# Final LSTM model assembled from the chosen hyper-parameters: a regularised
# LSTM layer, dropout, and one output unit per target column.
model = Sequential([
    LSTM(best_params['units'], input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), kernel_regularizer=l2(best_params['reg_lambda'])),
    Dropout(best_params['dropout_rate']),
    Dense(Y_train.shape[1], activation=best_params['activation']),
])
model.compile(loss='binary_crossentropy', optimizer=best_params['optimizer'], metrics=['accuracy'])

In [None]:
# Train the final LSTM model with the batch size and epoch count from best_params.
model.fit(X_train_lstm, Y_train, batch_size=best_params['batch_size'], epochs=best_params['epochs'])

**Using the built-in embedding layer**

In [None]:
# Hyperparameters
embedding_dim = 100
hidden_dim = 64
# One output unit per target column, as in the other model cells.
# (`label_names` was never defined in this notebook, so len(label_names)
# raised a NameError.)
num_classes = Y_train.shape[1]
batch_size = 64
num_epochs = 5

# Create the model: trainable embedding -> regularised LSTM -> dropout -> softmax
model = Sequential()
model.add(Embedding(len(word_index) + 1, embedding_dim, input_length=max_sequence_length))
model.add(LSTM(hidden_dim,kernel_regularizer=l2(0.001)))
model.add(Dropout(0.2))

model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the Embedding layer is declared with
# input_length=max_sequence_length, i.e. it expects rows of padded token ids.
# X_train only holds those when the commented-out train_test_split over
# padded_sequences is used - confirm before running this cell.
model.fit(X_train, Y_train, batch_size=batch_size, epochs=num_epochs)



In [None]:
#training data results

Y_pred_train=model.predict(X_train_lstm)
original_train,pred_train=convert(Y_pred_train,Y_train,"tdidf")
# eval_metrics expects (true labels, predicted labels). The two arguments used
# to be passed the other way round, which distorts the reported precision and
# recall (accuracy is unaffected).
eval_metrics(original_train,pred_train,"Rnn",0)


In [None]:
#test data results

Y_pred_test=model.predict(x_test_lstm)
original_test,pred_test=convert(Y_pred_test,test_y,"tdidf")
# eval_metrics expects (true labels, predicted labels). The two arguments used
# to be passed the other way round, which distorts the reported precision and
# recall (accuracy is unaffected).
eval_metrics(original_test,pred_test,"Rnn",1)


**Visualization**

In [None]:
# Flatten the nested Results dictionary into the lists used by the bar charts.
x_tdidf = []
x_word2vec = []
Tags_5_tdidf, Tags_20_tdidf, Tags_100_tdidf = [], [], []
Tags_5_word2vec, Tags_20_word2vec, Tags_100_word2vec = [], [], []

# Score lists addressed by the tag-count key they collect.
tdidf_scores = {'5': Tags_5_tdidf, '20': Tags_20_tdidf, '100': Tags_100_tdidf}
word2vec_scores = {'5': Tags_5_word2vec, '20': Tags_20_word2vec, '100': Tags_100_word2vec}

for vectorization, per_model in Results.items():
    is_tdidf = vectorization == 'Tdidf'
    for model, per_tag in per_model.items():
        if is_tdidf:
            x_tdidf.append(model)
            target = tdidf_scores
        else:
            x_word2vec.append(model)
            # Naivebayes scores are not collected outside the Tdidf vectorization.
            if model == 'Naivebayes':
                continue
            target = word2vec_scores
        for Tags, score in per_tag.items():
            if Tags in target:
                target[Tags].append(score)

# Model names per panel (the loop variable `vectorization` is rebound here).
vectorization = [x_tdidf, x_word2vec]

# Tags_Y[panel][series]: scores for 5, 20 and 100 tags of every panel.
Tags_Y = [[Tags_5_tdidf, Tags_20_tdidf, Tags_100_tdidf], [Tags_5_word2vec, Tags_20_word2vec, Tags_100_word2vec]]

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Grouped bar chart of the stored scores: one panel per vectorization, one
# group per model and one bar per number of top tags.
bar_width = 0.2

fig, ax = plt.subplots(1, 2, figsize=(14, 5))

# Set the colors for the bars
color1 = '#FF5A5F'  # Red color for 5
color2 = '#00A699'  # Green color for 20
color3 = 'black'    # Black color for 100

# (horizontal offset, colour, legend label) of the three bars in every group
bar_series = [(-bar_width, color1, '5'), (0, color2, '20'), (bar_width, color3, '100')]

for idx, x_labels in enumerate(vectorization):
    panel = ax[idx]
    x = np.arange(len(x_labels))
    title = "Tdidf" if idx == 0 else "Word2vec"

    # Plot the bars and remember every rectangle for the value labels
    rects = []
    for (offset, colour, label), heights in zip(bar_series, Tags_Y[idx]):
        rects.extend(panel.bar(x + offset, heights, width=bar_width, color=colour, label=label))

    # Set the x-axis tick labels
    panel.set_xticks(x)
    panel.set_xticklabels(x_labels)

    # Set the axis labels and title
    panel.set_xlabel("Model Name", fontsize=10)
    panel.set_ylabel("Test Accuracy (%)", fontsize=10)
    panel.set_title(title, fontsize=14, y=1.03)
    panel.xaxis.set_label_coords(0.5, -0.1)

    # Set the legend with a heading and increased font size
    panel.legend(title='Top Tags', frameon=False, fontsize=10, bbox_to_anchor=(1.15, 1), loc='upper right')

    # Remove spines and ticks on the top and right sides
    for side in ('top', 'right'):
        panel.spines[side].set_visible(False)
    panel.tick_params(right=False, top=False)

    # Add grid lines
    panel.grid(axis='y', linestyle='--', alpha=0.5)

    # Write the value above each bar
    for rect in rects:
        height = rect.get_height()
        panel.annotate(f'{height}', xy=(rect.get_x() + rect.get_width() / 2, height),
                       xytext=(6, 4), textcoords="offset points",
                       ha='center', va='bottom', fontsize=10)

    # Adjust the bottom margin to add space between xlabel and xticks
    panel.tick_params(axis='x', which='major', pad=4)

# Adjust the layout and padding
fig.tight_layout(pad=-1)
plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.3, hspace=0.4)
# Show the plot
plt.show()


**Web scraping part: fetching a new set of questions**

In [None]:
# Load the questions scraped so far, so that newly scraped ones can be appended.
import pandas as pd

previous_questions = pd.read_csv("New_data.csv")

# Questions: list of (question text, tag) pairs
Questions = list(zip(previous_questions['Questions'], previous_questions["Tags"]))

In [None]:
# Number of (question, tag) entries currently held in memory.
len(Questions)

In [None]:


# Scrape question pages from Stack Overflow and append [text, first tag]
# entries to the in-memory Questions list.
from urllib.parse import urljoin  # builds absolute URLs without a doubled slash

BASE_URL = "https://stackoverflow.com"
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the loop forever

start=time.time()
for pageNumber in range(6000,10000):

    response=requests.get(urljoin(BASE_URL, "/questions"),
                          params={"tab":"newest","page":pageNumber,"pagesize":50,'sort':"MostVotes"},
                          timeout=REQUEST_TIMEOUT)

    data=BeautifulSoup(response.text,'html.parser' ) #parsing the html text

    req_data=data.find(id="questions")
    if req_data is None:
        # No question list in the response (for example blocked, rate-limited or
        # a changed page layout). Stop cleanly so the questions collected so far
        # can still be saved, instead of crashing on None or hammering the site.
        print(f"No question list found on page {pageNumber} (HTTP {response.status_code}); stopping.")
        break

    new_data=req_data.find_all("h3", class_="s-post-summary--content-title") # the tag that contains the question info

    print(time.time()-start,len(Questions),pageNumber) #checking the current stats of operation

    for element in new_data:
        if element.a is None:
            continue  # summary without a link: nothing to fetch
        link=element.a.attrs['href']

        response=requests.get(urljoin(BASE_URL, link), timeout=REQUEST_TIMEOUT)  #fetching the content related to the question
        time.sleep(0.8)

        data_questionwise=BeautifulSoup(response.text,'html.parser')
        try:
            question_wise_title=data_questionwise.find("div",id="question-header").h1.a.string #title

            question_wise_desc=data_questionwise.find("div",class_="s-prose js-post-body") #description

            question_wise_score=int(data_questionwise.find("div",class_="js-vote-count flex--item d-flex fd-column ai-center fc-theme-body-font fw-bold fs-subheading py4").string.replace(" ", "")) #score
        except (AttributeError, ValueError):
            # An expected element is missing or the score is not a number: skip
            # this question. A bare `except:` also swallowed KeyboardInterrupt.
            continue

        if question_wise_title is None or question_wise_desc is None:
            continue  # incomplete page: these lookups return None without raising

        if(question_wise_score<5):
            break

        all_paragraphs=question_wise_desc.find_all("p")

        total_description_question_wise=""

        for para in all_paragraphs:
            total_description_question_wise+=para.text

        # Concatenating the title and description, separated by a space so the
        # last title word and the first body word are not glued together.
        Final_content=question_wise_title+" "+total_description_question_wise

        tag_list=data_questionwise.find("ul",class_="ml0 list-ls-none js-post-tag-list-wrapper d-inline")
        if tag_list is None or tag_list.li is None:
            continue  # question without a readable tag list
        tag_question_wise=tag_list.li.text #tag

        Questions.append([Final_content,tag_question_wise])



print(len(Questions))




#479



In [None]:
# Write all collected questions (loaded and newly scraped) back to
# New_data.csv, overwriting the previous file.
df=pd.DataFrame(Questions,columns=["Questions","Tags"])
df.to_csv("New_data.csv",index=False)