Dataset link on Kaggle: https://www.kaggle.com/datasets/stackoverflow/stacksample

In [1]:
import os
import string
from collections import Counter

import numpy as np 
import pandas as pd 
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

**Reading Data**

In [2]:
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

/kaggle/input/stacksample/Answers.csv
/kaggle/input/stacksample/Questions.csv
/kaggle/input/stacksample/Tags.csv


In [3]:
# Directory that holds the StackSample CSV files (Questions.csv, Answers.csv, Tags.csv).
data_path = "/kaggle/input/stacksample/"

In [4]:
# Load the questions table (the file is decoded as latin1), then show its shape and first rows.
questions = pd.read_csv(os.path.join(data_path, "Questions.csv"), encoding="latin1")
print(questions.shape)  # (rows, columns)
questions.head(10)

(1264216, 7)


Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
5,330,63.0,2008-08-02T02:51:36Z,,29,Should I use nested classes in this case?,<p>I am working on a collection of classes use...
6,470,71.0,2008-08-02T15:11:47Z,2016-03-26T05:23:29Z,13,Homegrown consumption of web services,<p>I've been writing a few web services for a ...
7,580,91.0,2008-08-02T23:30:59Z,,21,Deploying SQL Server Databases from Test to Live,<p>I wonder how you guys manage deployment of ...
8,650,143.0,2008-08-03T11:12:52Z,,79,Automatically update version number,<p>I would like the version property of my app...
9,810,233.0,2008-08-03T20:35:01Z,,9,Visual Studio Setup Project - Per User Registr...,<p>I'm trying to maintain a Setup Project in <...


In [5]:
# Load the answers table (the file is decoded as latin1), then show its shape and first rows.
answers = pd.read_csv(os.path.join(data_path, "Answers.csv"), encoding="latin1")
print(answers.shape)  # (rows, columns)
answers.head(10)

(2014516, 6)


Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,92,61.0,2008-08-01T14:45:37Z,90,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
1,124,26.0,2008-08-01T16:09:47Z,80,12,<p>I wound up using this. It is a kind of a ha...
2,199,50.0,2008-08-01T19:36:46Z,180,1,<p>I've read somewhere the human eye can't dis...
3,269,91.0,2008-08-01T23:49:57Z,260,4,"<p>Yes, I thought about that, but I soon figur..."
4,307,49.0,2008-08-02T01:49:46Z,260,28,"<p><a href=""http://www.codeproject.com/Article..."
5,332,59.0,2008-08-02T03:00:24Z,330,19,<p>I would be a bit reluctant to use nested cl...
6,344,100.0,2008-08-02T04:18:15Z,260,6,<p>You might be able to use IronRuby for that....
7,359,119.0,2008-08-02T06:16:23Z,260,5,"<P>You could use any of the DLR languages, whi..."
8,473,49.0,2008-08-02T15:33:13Z,470,8,"<p>No, what you're doing is fine. Don't let th..."
9,529,86.0,2008-08-02T18:16:07Z,180,3,<p>Isn't it also a factor which order you set ...


In [6]:
# Load the tags table (one row per question Id / tag pair), then show its shape and first rows.
tags = pd.read_csv(os.path.join(data_path, "Tags.csv"), encoding="latin1")
print(tags.shape)  # (rows, columns)
tags.head(10)

(3750994, 2)


Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn
5,90,branch
6,90,branching-and-merging
7,120,sql
8,120,asp.net
9,120,sitemap


In [7]:
# Summarize the questions frame: column dtypes, non-null counts and memory usage.
questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1264216 entries, 0 to 1264215
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Id            1264216 non-null  int64  
 1   OwnerUserId   1249762 non-null  float64
 2   CreationDate  1264216 non-null  object 
 3   ClosedDate    55959 non-null    object 
 4   Score         1264216 non-null  int64  
 5   Title         1264216 non-null  object 
 6   Body          1264216 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 67.5+ MB


In [8]:
# Fraction of non-null values per column of the questions frame.
print(questions.notnull().sum().div(questions.shape[0]))

Id              1.000000
OwnerUserId     0.988567
CreationDate    1.000000
ClosedDate      0.044264
Score           1.000000
Title           1.000000
Body            1.000000
dtype: float64


In [9]:
# Summarize the answers frame: column dtypes and memory usage.
answers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2014516 entries, 0 to 2014515
Data columns (total 6 columns):
 #   Column        Dtype  
---  ------        -----  
 0   Id            int64  
 1   OwnerUserId   float64
 2   CreationDate  object 
 3   ParentId      int64  
 4   Score         int64  
 5   Body          object 
dtypes: float64(1), int64(3), object(2)
memory usage: 92.2+ MB


In [10]:
# Fraction of non-null values per column of the answers frame.
print(answers.notnull().sum().div(answers.shape[0]))

Id              1.000000
OwnerUserId     0.993448
CreationDate    1.000000
ParentId        1.000000
Score           1.000000
Body            1.000000
dtype: float64


In [11]:
# Summarize the tags frame: column dtypes and memory usage.
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3750994 entries, 0 to 3750993
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Id      int64 
 1   Tag     object
dtypes: int64(1), object(1)
memory usage: 57.2+ MB


In [12]:
# Fraction of non-null values per column of the tags frame.
print(tags.notnull().sum().div(tags.shape[0]))

Id     1.000000
Tag    0.999703
dtype: float64


**preprocessing**

Nulls

In [13]:
# Drop every row of the tags frame that contains a missing value.
tags = tags.dropna(axis=0, how='any')

Number of unique tags

In [14]:
# Number of distinct values in the Tag column.
tags['Tag'].nunique(dropna=False)

37034

In [15]:
# Count how many rows each tag has and order the tags from most to least frequent.
sorted_tags = (
    tags.groupby('Tag')
    .size()
    .reset_index(name='Count')
    .sort_values(by='Count', ascending=False)
)

In [23]:
# Inspect the 30,000 least frequent tags (sorted_tags is ordered by Count, descending).
print(sorted_tags.tail(30000))

                 Tag  Count
3409   bidirectional     34
28663     selectlist     34
3360     berkeley-db     34
19287            mdf     34
12652          gmock     34
...              ...    ...
31989           tbcd      1
10167     evil-dicom      1
10169            evo      1
31986           tbar      1
24456         pjsua2      1

[30000 rows x 2 columns]


In [51]:
# Keep only the tag rows whose tag is one of the 60 most frequent tags.
top_tags = sorted_tags['Tag'].head(60)
filtered_tags = tags.loc[tags['Tag'].isin(top_tags)]

In [49]:
# Preview the first rows that survived the top-tag filter.
print(filtered_tags.head())

     Id      Tag
7   120      sql
8   120  asp.net
14  260       c#
15  260     .net
18  330      c++


1057478

In [52]:
# Number of distinct question Ids that carry at least one of the top tags.
print(filtered_tags['Id'].nunique(dropna=False))

1009697


In [17]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so this cell raises
# NameError — presumably a deliberate way to halt "Run All" before the unfinished cells below; confirm.
stop

NameError: name 'stop' is not defined

In [None]:
# Collapse the tag rows into one space-separated tag string per question Id.
string_tags_df = (
    tags.groupby("Id", as_index=False)["Tag"]
    .agg(lambda question_tags: " ".join(question_tags))
)

In [None]:
# Number of rows in string_tags_df (one row per question Id).
string_tags_df.shape[0]

In [None]:
# Preview the first five Id / joined-tag-string rows.
string_tags_df.head(5)

In [None]:
# Number of distinct joined tag strings (i.e. distinct tag combinations).
string_tags_df['Tag'].nunique(dropna=False)

In [None]:
# Count how often each joined tag string occurs and order the result by that count, descending.
full_tag_counts = (
    string_tags_df['Tag']
    .value_counts()
    .reset_index(name='Count')
    .set_axis(['Tag', 'Count'], axis=1)
)
sorted_full_tag_counts = full_tag_counts.sort_values(by='Count', ascending=False)

Wrong approach (kept for reference): counting each question's full tag combination as a single label.

In [None]:
# Inspect the 650,000 least frequent tag combinations (the frame is ordered by Count, descending).
print(sorted_full_tag_counts.tail(650000))

In [None]:
# Total number of questions whose tag combination falls outside the first 1000 rows of the sorted counts.
count_other_tags = sorted_full_tag_counts['Count'].iloc[1000:].sum()
count_other_tags

In [None]:
# Distribution of answers per question: {number of answers: number of questions with that many}.
answer_counts = answers.groupby("ParentId").size().to_dict()
value_counts = Counter(answer_counts.values())
sorted_counts = {n_answers: value_counts[n_answers] for n_answers in sorted(value_counts)}
print(sorted_counts)

**Exclude answers with a zero or negative score**

In [None]:
# Same distribution as above, restricted to answers whose Score is strictly positive.
answer_counts = answers.loc[answers['Score'] > 0].groupby("ParentId").size().to_dict()
value_counts = Counter(answer_counts.values())
sorted_counts = {n_answers: value_counts[n_answers] for n_answers in sorted(value_counts)}
print(sorted_counts)

Add two new columns: whether the question contains a link or not, and the question text after removing the HTML tags.

In [None]:
def remove_html_tags(html):
    """Strip the markup from an HTML fragment and report whether it contained a link.

    Args:
        html: HTML source to parse with BeautifulSoup's 'html.parser'.

    Returns:
        A ``(has_link, text)`` tuple: ``has_link`` is True when the fragment has at
        least one ``<a>`` element with an ``href`` attribute; ``text`` is the visible
        text with the pieces stripped and joined by single spaces.
    """
    parsed = BeautifulSoup(html, 'html.parser')
    plain_text = parsed.get_text(separator=' ', strip=True)
    link_found = parsed.find('a', href=True) is not None
    return link_found, plain_text

***feature engineering***

In [None]:
# Parse every question body once and split the (has_link, text) pairs into two new columns.
questions['has_link'], questions['clean_text'] = zip(*map(remove_html_tags, questions['Body']))

In [None]:
# Title and cleaned body joined by a newline. The frame is `questions` (no `df` exists in
# this notebook) and the column is named 'Title' (capitalized) in Questions.csv.
questions['combined_text'] = questions['Title'] + "\n" + questions['clean_text']

In [None]:
# Drop the raw HTML body and the title. The column is named 'Title' (capitalized);
# dropping 'title' raised KeyError. Remove 'Title' from this list if the title is needed later.
questions = questions.drop(columns=['Body', 'Title'])

In [None]:
# Parse every answer body once and split the (has_link, text) pairs into two new columns.
answers['has_link'], answers['clean_text'] = zip(*map(remove_html_tags, answers['Body']))

In [None]:
# The raw HTML body is no longer needed once the text has been extracted.
answers = answers.drop(columns=['Body'])

In [None]:
# Character count of each cleaned question text.
questions['text_length'] = questions['clean_text'].map(len)

In [None]:
# Number of whitespace-separated tokens in each cleaned question text.
questions['word_count'] = questions['clean_text'].map(lambda text: len(text.split()))

Hyperparameters to play with: the minimum score thresholds used to filter questions and answers.

In [None]:
# Keep only the questions whose Score is strictly positive.
questions = questions.loc[questions['Score'].gt(0)]

In [None]:
# Keep only the answers whose Score is strictly positive. The result must be assigned back
# to `answers`: storing it in `answer_counts` (a name that held a dict earlier and is not
# read afterwards) left the later `answers.groupby(...)` working on the unfiltered frame.
answers = answers[answers['Score'] > 0]

We can add a new feature by adding a column with the number of answers.

In [None]:
# Concatenate all answer texts of a question into one string, keyed by the question Id.
# 'clean_text' is used because the raw 'Body' column was dropped from `answers` earlier,
# so grouping on 'Body' raised KeyError.
answers_grouped = answers.groupby('ParentId')['clean_text'].agg(' '.join).reset_index()
answers_grouped.columns = ['Id', 'AllAnswers']

In [None]:
# Left-join the concatenated answers onto the questions; questions without a match keep a null AllAnswers.
questions = pd.merge(questions, answers_grouped, how='left', on='Id')

In [None]:
# Fraction of non-null values per column of the questions frame. The denominator must be the
# number of questions; dividing by answers.shape[0] gave meaningless ratios.
print(questions.notnull().sum() / questions.shape[0])

In [None]:
# Fraction of non-null values per column of the answers frame.
print(answers.notnull().sum().div(answers.shape[0]))

***tf_idf***

In [None]:
# Translation table that deletes every character in string.punctuation. Built once here
# rather than on every call, since the function is used as the vectorizer's preprocessor.
# Requires `import string` in the notebook's import cell.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without ASCII punctuation; all other characters
        (including whitespace and letter case) are left unchanged.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [None]:

# TF-IDF vectorizer over unigrams with English stop words removed.
# NOTE: passing a custom `preprocessor` replaces scikit-learn's default preprocessing stage
# (accent stripping and lowercasing), so the text is NOT lowercased with this configuration.
# TODO: lowercase inside the preprocessor if case-insensitive features are wanted.
tfidf = TfidfVectorizer(
    max_features=3000,  # can be raised, but a larger vocabulary may crash the kernel
    stop_words='english',
    ngram_range=(1, 1),  # (1, 2) adds bigrams but may run out of RAM
    preprocessor=remove_punctuation,
)

In [None]:
# Fit the vectorizer on the cleaned question bodies and build the TF-IDF feature matrix.
X = tfidf.fit_transform(questions['clean_text'])  # other features can be added here

In [None]:
# Fit a separate vectorizer (same settings) on the title + body text. Calling
# `tfidf.fit_transform` again would overwrite the vocabulary fitted for `X`, leaving
# `tfidf` unusable for transforming new text consistently with `X`.
tfidf_combined = TfidfVectorizer(**tfidf.get_params())
X_question_included = tfidf_combined.fit_transform(questions['combined_text'])  # other features can be added here

Do not forget label encoding.

In [None]:
# NOTE: `y` (the encoded target labels) is not defined in the cells visible above; it must be
# built before this cell can run. `random_state` is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models , classifier   