dataset link on kaggle : https://www.kaggle.com/datasets/stackoverflow/stacksample

In [3]:
import numpy as np 
import pandas as pd 
import os
from bs4 import BeautifulSoup
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

**Reading Data**

In [4]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/stacksample/Answers.csv
/kaggle/input/stacksample/Questions.csv
/kaggle/input/stacksample/Tags.csv


In [5]:
data_path = "/kaggle/input/stacksample/"

In [6]:
questions = pd.read_csv(data_path + "Questions.csv", encoding='latin1')
print(questions.shape)

(1264216, 7)


In [7]:
questions.head(10)

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
5,330,63.0,2008-08-02T02:51:36Z,,29,Should I use nested classes in this case?,<p>I am working on a collection of classes use...
6,470,71.0,2008-08-02T15:11:47Z,2016-03-26T05:23:29Z,13,Homegrown consumption of web services,<p>I've been writing a few web services for a ...
7,580,91.0,2008-08-02T23:30:59Z,,21,Deploying SQL Server Databases from Test to Live,<p>I wonder how you guys manage deployment of ...
8,650,143.0,2008-08-03T11:12:52Z,,79,Automatically update version number,<p>I would like the version property of my app...
9,810,233.0,2008-08-03T20:35:01Z,,9,Visual Studio Setup Project - Per User Registr...,<p>I'm trying to maintain a Setup Project in <...


In [None]:
#answers = pd.read_csv(data_path + "Answers.csv", encoding='latin1')
#print(answers.shape)

In [None]:
#answers.head(10)

In [8]:
tags = pd.read_csv(data_path + "Tags.csv", encoding='latin1')
print(tags.shape)

(3750994, 2)


In [9]:
tags.head(10)

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn
5,90,branch
6,90,branching-and-merging
7,120,sql
8,120,asp.net
9,120,sitemap


In [None]:
questions.info()

In [None]:
print(questions.notnull().sum()/questions.shape[0])

In [None]:
#answers.info()

In [None]:
#print(answers.notnull().sum()/answers.shape[0])

In [None]:
tags.info()

In [None]:
print(tags.notnull().sum()/tags.shape[0])

**preprocessing**

Nulls

In [10]:
tags=tags.dropna()

Number of unique tags

In [11]:
len(tags['Tag'].unique())

37034

In [12]:
sorted_tags = tags.groupby('Tag').size().reset_index(name='Count') \
                 .sort_values('Count', ascending=False)

In [13]:
print(sorted_tags.tail(30000))

                 Tag  Count
3409   bidirectional     34
28663     selectlist     34
3360     berkeley-db     34
19287            mdf     34
12652          gmock     34
...              ...    ...
31989           tbcd      1
10167     evil-dicom      1
10169            evo      1
31986           tbar      1
24456         pjsua2      1

[30000 rows x 2 columns]


In [14]:
top_tags = sorted_tags.head(60)['Tag']
filtered_tags = tags[tags['Tag'].isin(top_tags)]

In [15]:
print(filtered_tags.head())

     Id        Tag
7   120        sql
8   120    asp.net
10  180  algorithm
14  260         c#
15  260       .net


1057478

In [16]:
print(len(filtered_tags['Id'].unique()))

1009697


wrong approach

In [None]:
#string_tags_df = tags.groupby("Id", as_index=False)["Tag"].agg(" ".join)

In [None]:
#len(string_tags_df)

In [None]:
#string_tags_df.head(5)

In [None]:
#len(string_tags_df['Tag'].unique())

In [None]:
#full_tag_counts = string_tags_df['Tag'].value_counts().reset_index(name='Count')
#full_tag_counts.columns = ['Tag', 'Count']
#sorted_full_tag_counts = full_tag_counts.sort_values('Count', ascending=False)

In [None]:
#print(sorted_full_tag_counts.tail(650000))

In [None]:
#count_other_tags = sorted_full_tag_counts.iloc[1000:]['Count'].sum()
#count_other_tags

In [None]:
#answer_counts = answers.groupby("ParentId").size().to_dict()
#value_counts = Counter(answer_counts.values())
#sorted_counts = dict(sorted(value_counts.items()))
#print(sorted_counts)

**not include answers with zero or negative score**

In [None]:
#answer_counts = answers[answers['Score']>0].groupby("ParentId").size().to_dict()
#value_counts = Counter(answer_counts.values())
#sorted_counts = dict(sorted(value_counts.items()))
#print(sorted_counts)

add new column is the questions has link or now and the question after remove html tags

In [17]:
def remove_html_tags(html):
    soup = BeautifulSoup(html, 'html.parser')
    has_link = soup.find('a', href=True) is not None 
    text = soup.get_text(separator=' ', strip=True)
    return has_link, text

hyperparameters to play with

In [18]:
questions = questions[questions['Score'] > -2]
len(questions)


1229546

In [None]:
#answer_counts = answers[answers['Score']>0]

***feature engineering***

In [19]:
questions['has_link'], questions['clean_text'] = zip(*questions['Body'].apply(remove_html_tags))

In [22]:
questions['combined_text'] = questions['Title'] + "\n" + questions['clean_text']

In [23]:
questions.drop(columns=['Body','Title'], inplace=True) # can make it questions if we will use it 

In [None]:
#answers['has_link'], answers['clean_text'] = zip(*answers['Body'].apply(remove_html_tags))

In [None]:
#answers.drop(columns=['Body'], inplace=True) 

In [None]:
questions['text_length'] = questions['clean_text'].apply(len)

In [None]:
questions['word_count'] = questions['clean_text'].apply(lambda x: len(x.split()))

we can add new feature by add column of numbers of answers 

In [None]:
#answers_grouped = answers.groupby('ParentId')['Body'].apply(lambda x: ' '.join(x)).reset_index()
#answers_grouped.columns = ['Id', 'AllAnswers']

In [None]:
#questions = questions.merge(answers_grouped, on='Id', how='left')

In [24]:
print(questions.notnull().sum()/questions.shape[0])

Id               1.000000
OwnerUserId      0.988291
CreationDate     1.000000
ClosedDate       0.036786
Score            1.000000
has_link         1.000000
clean_text       1.000000
combined_text    1.000000
dtype: float64


In [None]:
#print(answers.notnull().sum()/answers.shape[0])

In [25]:
grouped_tags = filtered_tags.groupby('Id')['Tag'].apply(list).reset_index(name='tags')


In [26]:
grouped_tags.head()

Unnamed: 0,Id,tags
0,120,"[sql, asp.net]"
1,180,[algorithm]
2,260,"[c#, .net]"
3,330,[c++]
4,470,[.net]


In [27]:
merged_df = pd.merge(questions, grouped_tags, on='Id', how='inner')


In [28]:
merged_df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,has_link,clean_text,combined_text,tags
0,120,83.0,2008-08-01T15:50:08Z,,21,False,Has anyone got experience creating SQL-based A...,ASP.NET Site Maps\nHas anyone got experience c...,"[sql, asp.net]"
1,180,2089740.0,2008-08-01T18:42:19Z,,53,False,This is something I've pseudo-solved many time...,Function for creating color wheels\nThis is so...,[algorithm]
2,260,91.0,2008-08-01T23:22:08Z,,49,True,I have a little game written in C#. It uses a ...,Adding scripting functionality to .NET applica...,"[c#, .net]"
3,330,63.0,2008-08-02T02:51:36Z,,29,False,I am working on a collection of classes used f...,Should I use nested classes in this case?\nI a...,[c++]
4,470,71.0,2008-08-02T15:11:47Z,2016-03-26T05:23:29Z,13,False,I've been writing a few web services for a .ne...,Homegrown consumption of web services\nI've be...,[.net]


In [29]:
merged_df.columns

Index(['Id', 'OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'has_link',
       'clean_text', 'combined_text', 'tags'],
      dtype='object')

In [30]:
merged_df.drop(['Id', 'OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'has_link',
       ],axis=True)

Unnamed: 0,combined_text,tags
0,ASP.NET Site Maps\nHas anyone got experience c...,"[sql, asp.net]"
1,Function for creating color wheels\nThis is so...,[algorithm]
2,Adding scripting functionality to .NET applica...,"[c#, .net]"
3,Should I use nested classes in this case?\nI a...,[c++]
4,Homegrown consumption of web services\nI've be...,[.net]
...,...,...
978064,"YouTube iFrame API: no ready call, no error ca...",[javascript]
978065,How to execute multiline python code from a ba...,"[python, bash]"
978066,URL routing in PHP (MVC)\nI am building a cust...,[php]
978067,Obfuscating code in android studio\nUnder mini...,[android]


In [32]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()


In [33]:
y = mlb.fit_transform(merged_df['tags'])

In [38]:
binary_df = pd.DataFrame(y, columns=mlb.classes_)

In [39]:
binary_df

Unnamed: 0,.net,ajax,algorithm,android,angularjs,apache,arrays,asp.net,asp.net-mvc,bash,...,swift,twitter-bootstrap,vb.net,vba,windows,winforms,wordpress,wpf,xcode,xml
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
978064,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
978065,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
978066,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
978067,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


***tf_idf***

In [40]:
def remove_punctuation(text):
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower()

In [42]:
tfidf = TfidfVectorizer(
    max_features=3000,# could change but takecare of crash
    stop_words='english',
    ngram_range=(1, 1),# you can make it 2 but could get crash due to ram
    preprocessor=remove_punctuation #
)

In [None]:
X = tfidf.fit_transform(merged_df['clean_text'])#u can add other feature here 

In [51]:
X = tfidf.fit_transform(merged_df['combined_text'])#u can add other feature here 

donot forget label encoding

In [53]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, binary_df, test_size=0.2, random_state=42
)


models , classifier   

In [60]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler(with_mean=False)
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)


In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

clf = OneVsRestClassifier(LogisticRegression(solver='saga', max_iter=300))
clf.fit(X_train, y_train)


In [73]:
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)
from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Train Accuracy: {train_accuracy*100}")
print(f"Test Accuracy: {accuracy*100}")

Train Accuracy: 40.01444172508323.
Test Accuracy: 39.132679665054646
