dataset link on kaggle : https://www.kaggle.com/datasets/stackoverflow/stacksample

In [1]:
import numpy as np 
import pandas as pd 
import os
from bs4 import BeautifulSoup
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm

**Reading Data**

In [2]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/stacksample/Answers.csv
/kaggle/input/stacksample/Questions.csv
/kaggle/input/stacksample/Tags.csv


In [3]:
data_path = "/kaggle/input/stacksample/"

In [4]:
questions = pd.read_csv(data_path + "Questions.csv", encoding='latin1')
print(questions.shape)

(1264216, 7)


In [7]:
questions.head(10)

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
5,330,63.0,2008-08-02T02:51:36Z,,29,Should I use nested classes in this case?,<p>I am working on a collection of classes use...
6,470,71.0,2008-08-02T15:11:47Z,2016-03-26T05:23:29Z,13,Homegrown consumption of web services,<p>I've been writing a few web services for a ...
7,580,91.0,2008-08-02T23:30:59Z,,21,Deploying SQL Server Databases from Test to Live,<p>I wonder how you guys manage deployment of ...
8,650,143.0,2008-08-03T11:12:52Z,,79,Automatically update version number,<p>I would like the version property of my app...
9,810,233.0,2008-08-03T20:35:01Z,,9,Visual Studio Setup Project - Per User Registr...,<p>I'm trying to maintain a Setup Project in <...


In [None]:
#answers = pd.read_csv(data_path + "Answers.csv", encoding='latin1')
#print(answers.shape)

In [None]:
#answers.head(10)

In [5]:
tags = pd.read_csv(data_path + "Tags.csv", encoding='latin1')
print(tags.shape)

(3750994, 2)


In [9]:
tags.head(10)

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn
5,90,branch
6,90,branching-and-merging
7,120,sql
8,120,asp.net
9,120,sitemap


In [None]:
questions.info()

In [None]:
print(questions.notnull().sum()/questions.shape[0])

In [None]:
#answers.info()

In [None]:
#print(answers.notnull().sum()/answers.shape[0])

In [None]:
tags.info()

In [None]:
print(tags.notnull().sum()/tags.shape[0])

**preprocessing**

Nulls

In [6]:
tags=tags.dropna()

Number of unique tags

In [7]:
tqdm.pandas()

In [11]:
len(tags['Tag'].unique())

37034

In [8]:
sorted_tags = tags.groupby('Tag').size().reset_index(name='Count') \
                 .sort_values('Count', ascending=False)

In [13]:
print(sorted_tags.tail(30000))

                 Tag  Count
3409   bidirectional     34
28663     selectlist     34
3360     berkeley-db     34
19287            mdf     34
12652          gmock     34
...              ...    ...
31989           tbcd      1
10167     evil-dicom      1
10169            evo      1
31986           tbar      1
24456         pjsua2      1

[30000 rows x 2 columns]


In [9]:
top_tags = sorted_tags.head(30)['Tag']
filtered_tags = tags[tags['Tag'].isin(top_tags)]

In [15]:
print(filtered_tags.head())

     Id        Tag
7   120        sql
8   120    asp.net
10  180  algorithm
14  260         c#
15  260       .net


1057478

In [16]:
print(len(filtered_tags['Id'].unique()))

1009697


wrong approach

In [None]:
#string_tags_df = tags.groupby("Id", as_index=False)["Tag"].agg(" ".join)

In [None]:
#len(string_tags_df)

In [None]:
#string_tags_df.head(5)

In [None]:
#len(string_tags_df['Tag'].unique())

In [None]:
#full_tag_counts = string_tags_df['Tag'].value_counts().reset_index(name='Count')
#full_tag_counts.columns = ['Tag', 'Count']
#sorted_full_tag_counts = full_tag_counts.sort_values('Count', ascending=False)

In [None]:
#print(sorted_full_tag_counts.tail(650000))

In [None]:
#count_other_tags = sorted_full_tag_counts.iloc[1000:]['Count'].sum()
#count_other_tags

In [None]:
#answer_counts = answers.groupby("ParentId").size().to_dict()
#value_counts = Counter(answer_counts.values())
#sorted_counts = dict(sorted(value_counts.items()))
#print(sorted_counts)

**not include answers with zero or negative score**

In [None]:
#answer_counts = answers[answers['Score']>0].groupby("ParentId").size().to_dict()
#value_counts = Counter(answer_counts.values())
#sorted_counts = dict(sorted(value_counts.items()))
#print(sorted_counts)

add new column is the questions has link or now and the question after remove html tags

In [10]:
def remove_html_tags(html):
    soup = BeautifulSoup(html, 'html.parser')
    has_link = soup.find('a', href=True) is not None 
    text = soup.get_text(separator=' ', strip=True)
    return has_link, text

hyperparameters to play with

In [18]:
#questions = questions[questions['Score']]
len(questions)


1229546

In [None]:
#answer_counts = answers[answers['Score']>0]

***feature engineering***

In [11]:
questions['has_link'], questions['clean_text'] = zip(*questions['Body'].progress_apply(remove_html_tags))


100%|██████████| 1264216/1264216 [09:57<00:00, 2117.61it/s]


In [12]:
questions['combined_text'] = questions['Title'] + " " + questions['clean_text']

In [13]:
questions.drop(columns=['Body','Title'], inplace=True) # can make it questions if we will use it 

In [None]:
#answers['has_link'], answers['clean_text'] = zip(*answers['Body'].apply(remove_html_tags))

In [None]:
#answers.drop(columns=['Body'], inplace=True) 

In [None]:
questions['text_length'] = questions['clean_text'].apply(len)

In [None]:
questions['word_count'] = questions['clean_text'].apply(lambda x: len(x.split()))

In [17]:
programming_terms = {
        'javascript', 'java', 'c#', 'php', 'android', 'jquery',
        'python', 'html', 'c++', 'ios', 'mysql', 'css', 'sql',
        'asp.net', 'objective-c', 'ruby-on-rails', '.net', 'c',
        'iphone', 'angularjs', 'arrays', 'sql-server', 'json',
        'ruby', 'r', 'ajax', 'regex', 'xml', 'node.js',
        'asp.net-mvc', 'linux', 'django', 'wpf', 'database', 'swift'
    }

In [18]:
programming_terms = {term.lower() for term in programming_terms}


In [19]:
def clean_preserve_programming_terms(text):
    words = text.lower().split()
    cleaned = [
        word if word in programming_terms else re.sub(r'[^\w\s]', '', word)
        for word in words
    ]
    return ' '.join(cleaned)

In [20]:
questions['cleaned_text'] = questions['combined_text'].progress_apply(clean_preserve_programming_terms)


100%|██████████| 1264216/1264216 [03:38<00:00, 5775.27it/s]


we can add new feature by add column of numbers of answers 

In [None]:
#answers_grouped = answers.groupby('ParentId')['Body'].apply(lambda x: ' '.join(x)).reset_index()
#answers_grouped.columns = ['Id', 'AllAnswers']

In [None]:
#questions = questions.merge(answers_grouped, on='Id', how='left')

In [21]:
print(questions.notnull().sum()/questions.shape[0])

Id               1.000000
OwnerUserId      0.988567
CreationDate     1.000000
ClosedDate       0.044264
Score            1.000000
has_link         1.000000
clean_text       1.000000
combined_text    1.000000
cleaned_text     1.000000
dtype: float64


In [None]:
#print(answers.notnull().sum()/answers.shape[0])

In [22]:
grouped_tags = filtered_tags.groupby('Id')['Tag'].apply(list).reset_index(name='tags')


In [23]:
grouped_tags.head()

Unnamed: 0,Id,tags
0,120,"[sql, asp.net]"
1,260,"[c#, .net]"
2,330,[c++]
3,470,[.net]
4,580,[sql-server]


In [24]:
merged_df = pd.merge(questions, grouped_tags, on='Id', how='inner')


In [28]:
merged_df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,has_link,clean_text,combined_text,tags
0,120,83.0,2008-08-01T15:50:08Z,,21,False,Has anyone got experience creating SQL-based A...,ASP.NET Site Maps\nHas anyone got experience c...,"[sql, asp.net]"
1,180,2089740.0,2008-08-01T18:42:19Z,,53,False,This is something I've pseudo-solved many time...,Function for creating color wheels\nThis is so...,[algorithm]
2,260,91.0,2008-08-01T23:22:08Z,,49,True,I have a little game written in C#. It uses a ...,Adding scripting functionality to .NET applica...,"[c#, .net]"
3,330,63.0,2008-08-02T02:51:36Z,,29,False,I am working on a collection of classes used f...,Should I use nested classes in this case?\nI a...,[c++]
4,470,71.0,2008-08-02T15:11:47Z,2016-03-26T05:23:29Z,13,False,I've been writing a few web services for a .ne...,Homegrown consumption of web services\nI've be...,[.net]


In [29]:
merged_df.columns

Index(['Id', 'OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'has_link',
       'clean_text', 'combined_text', 'tags'],
      dtype='object')

In [28]:
merged_df.drop(['Id', 'OwnerUserId', 'CreationDate', 'ClosedDate',  'has_link',
       ],axis=True)

Unnamed: 0,Score,clean_text,combined_text,cleaned_text,tags
0,21,Has anyone got experience creating SQL-based A...,ASP.NET Site Maps Has anyone got experience cr...,asp.net site maps has anyone got experience cr...,"[sql, asp.net]"
1,49,I have a little game written in C#. It uses a ...,Adding scripting functionality to .NET applica...,adding scripting functionality to .net applica...,"[c#, .net]"
2,29,I am working on a collection of classes used f...,Should I use nested classes in this case? I am...,should i use nested classes in this case i am ...,[c++]
3,13,I've been writing a few web services for a .ne...,Homegrown consumption of web services I've bee...,homegrown consumption of web services ive been...,[.net]
4,21,I wonder how you guys manage deployment of a d...,Deploying SQL Server Databases from Test to Li...,deploying sql server databases from test to li...,[sql-server]
...,...,...,...,...,...
911491,0,"I'm trying to detect the ""flash out of date"" e...","YouTube iFrame API: no ready call, no error ca...",youtube iframe api no ready call no error call...,[javascript]
911492,1,I need to extend a shell script (bash). As I a...,How to execute multiline python code from a ba...,how to execute multiline python code from a ba...,[python]
911493,0,I am building a custom MVC project and I have ...,URL routing in PHP (MVC) I am building a custo...,url routing in php mvc i am building a custom ...,[php]
911494,1,Under minifyEnabled I changed from false to tr...,Obfuscating code in android studio Under minif...,obfuscating code in android studio under minif...,[android]


In [30]:
merged_df.drop(['clean_text', 'combined_text'],axis=True)

KeyError: "['clean_text', 'combined_text'] not found in axis"

In [31]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()


In [32]:
y = mlb.fit_transform(merged_df['tags'])

In [33]:
binary_df = pd.DataFrame(y, columns=mlb.classes_)

In [34]:
binary_df

Unnamed: 0,.net,ajax,android,angularjs,arrays,asp.net,asp.net-mvc,c,c#,c++,...,objective-c,php,python,r,regex,ruby,ruby-on-rails,sql,sql-server,xml
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911491,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
911492,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
911493,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
911494,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


***tf_idf***

In [36]:
tfidf = TfidfVectorizer(
    max_features=10000,# could change but takecare of crash
    stop_words='english',
    ngram_range=(1, 2),# you can make it 2 but could get crash due to ram
)

In [37]:
clean_texts = list(tqdm(merged_df['combined_text'], desc="Loading text data"))


Loading text data: 100%|██████████| 911496/911496 [00:00<00:00, 2517386.09it/s]


In [38]:
X = tfidf.fit_transform(tqdm(clean_texts, desc="TF-IDF Vectorizing"))

TF-IDF Vectorizing: 100%|██████████| 911496/911496 [04:29<00:00, 3383.98it/s]


donot forget label encoding

In [44]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, binary_df, test_size=0.2, random_state=42
)

models , classifier   

In [60]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler(with_mean=False)
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)


In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

clf = OneVsRestClassifier(LogisticRegression(solver='saga', max_iter=1000))
clf.fit(X_train, y_train)


In [47]:
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)
from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Train Accuracy: {train_accuracy*100}")
print(f"Test Accuracy: {accuracy*100}")

Train Accuracy: 52.498230928310086
Test Accuracy: 51.058145913329675


In [49]:
joblib.dump(clf, 'logistic_ovr_model.pkl')

['logistic_ovr_model.pkl']