In [1]:
# import libraries

import sklearn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
# Lift pandas' display limits: show every row and the full text of each cell.
for display_option in ('display.max_rows', 'display.max_colwidth'):
  pd.set_option(display_option, None)

In [2]:
# Locations of the two input CSV files (questions and their tags).
file_questions = "/content/drive/MyDrive/nlp/classification/data/questions.csv"
file_tags = "/content/drive/MyDrive/nlp/classification/data/tags.csv"

# Candidate encodings, tried in this order when the CSV files are read.
# NOTE(review): latin1 can decode any byte sequence, so utf-16 is probably never reached - confirm the order.
encodings = [
  "utf-8",
  "latin1",
  "utf-16",
]

In [None]:
# Read the questions CSV, trying each candidate encoding until one decodes the file.
for encode in encodings:
  try:
    dfq = pd.read_csv(file_questions, encoding = encode)
  except UnicodeDecodeError:
    # This encoding cannot decode the file; move on to the next candidate.
    continue
  print("File reading succesful with ", encode)
  break
else:
  # Nothing worked: stop here so a missing (or stale, from an earlier run) dfq is never used below.
  raise ValueError(f"Could not decode {file_questions} with any of {encodings}")

dfq.shape

In [None]:
# Read the tags CSV, trying each candidate encoding until one decodes the file.
for encode in encodings:
  try:
    dft = pd.read_csv(file_tags, encoding = encode)
  except UnicodeDecodeError:
    # This encoding cannot decode the file; move on to the next candidate.
    continue
  print("File reading succesful with ", encode)
  break
else:
  # Nothing worked: stop here so a missing (or stale, from an earlier run) dft is never used below.
  raise ValueError(f"Could not decode {file_tags} with any of {encodings}")

dft.shape

In [None]:
# Left-join the tags onto the questions by Id, then keep only the rows that actually have a tag.
df = (
  dfq
  .merge(dft, on='Id', how='left')
  .dropna(subset=['Tag'])
)

In [None]:
# Continue with only the first 1000 rows of the merged frame.
df = df.iloc[:1000]

# Preprocessing

In [None]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [None]:
# Fetch the NLTK data used by clean_text: tokenizer models, stop-word lists and WordNet.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# without it nltk.word_tokenize raises LookupError. Older releases just report the id as unknown.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# preprocess text
_STOP_WORDS = None   # English stop-word set, built on first use
_LEMMATIZER = None   # WordNet lemmatizer, built on first use


def _text_resources():
  """Return the shared (stop_words, lemmatizer) pair, creating it on the first call."""
  global _STOP_WORDS, _LEMMATIZER
  if _STOP_WORDS is None:
    _STOP_WORDS = frozenset(stopwords.words('english'))
    _LEMMATIZER = WordNetLemmatizer()
  return _STOP_WORDS, _LEMMATIZER


def clean_text(text):
  """Normalise a piece of text into a space-separated string of lemmatized words.

  Steps, in order: digits are replaced by spaces, ``<...>`` spans (HTML tags)
  are replaced by spaces, every remaining character that is not an ASCII
  letter or whitespace is replaced by a space, the text is tokenized, English
  stop words are dropped (compared case-insensitively) and each remaining
  token is lemmatized. The original letter case of kept tokens is preserved.

  Args:
    text: The string to clean.

  Returns:
    The surviving tokens joined by single spaces; an empty string when
    nothing survives.
  """
  # Built once and reused: loading the stop-word list on every call is wasted work
  # when this function is applied row by row.
  stop_words, lemmatizer = _text_resources()

  text = re.sub(r'\d', ' ', text)
  # Replace tags with a space (not nothing) so words on either side of a tag
  # are not fused together, e.g. "foo<br>bar" must not become "foobar".
  text = re.sub(r'<.*?>', ' ', text)
  text = re.sub(r'[^a-zA-Z\s]', ' ', text)   # keep only ASCII letters and whitespace
  tokens = nltk.word_tokenize(text)
  tokens = [word for word in tokens if word.lower() not in stop_words]   # stop words
  tokens = [lemmatizer.lemmatize(word) for word in tokens]   # lemmatization
  return ' '.join(tokens)

In [None]:
# Clean every tag with clean_text and keep only the first word that survives.
cleaned_tags = df['Tag'].astype(str).apply(clean_text)
first_words = cleaned_tags.str.split(' ').str[0]
df['clean_tag'] = first_words.str.strip()

In [None]:
# Restrict the data to rows whose cleaned tag is one of the 20 most frequent.
tag_counts = df['clean_tag'].value_counts()
top_tags = tag_counts.nlargest(20).index.tolist()
has_top_tag = df['clean_tag'].isin(top_tags)
df = df[has_top_tag]

In [None]:
# Clean the question titles.
# Missing titles are filled with '' first: astype(str) alone would turn NaN into the literal word 'nan'.
df['clean_title'] = df['Title'].fillna('').astype(str).apply(clean_text)

In [None]:
# Clean the question bodies.
# Missing bodies are filled with '' first: astype(str) alone would turn NaN into the literal word 'nan'.
df['clean_body'] = df['Body'].fillna('').astype(str).apply(clean_text)

In [None]:
# Drop the raw columns now that their cleaned counterparts exist.
df = df.drop(columns=['Id', 'OwnerUserId', 'CreationDate', 'Score', 'Title', 'Body', 'Tag'])
# clean_text returns '' (never NaN) when nothing survives cleaning, so a plain dropna()
# cannot see those rows. Turn blank strings into NaN first so they are actually removed.
df = df.replace(r'^\s*$', np.nan, regex=True).dropna()

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

In [None]:
# Write the cleaned frame to a CSV in the working directory (without the index column),
# then pass the file to google.colab's files.download.
output_csv = 'df_clean.csv'
df.to_csv(output_csv, index=False)

from google.colab import files
files.download(output_csv)