In [None]:
from csv import reader

import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import spacy

import preprocessing
import tfidf


# Raise the per-column display width so long text cells are not truncated
# when a frame is rendered.
pd.set_option('display.max_colwidth', 5000)



In [None]:
# Load the raw data; the first CSV column is used as the row index.
raw_csv_path = 'DB/Original_data.csv'
df = pd.read_csv(raw_csv_path, index_col=[0])
# Optional subsampling for quicker experiments (disabled):
#df = df.sample(frac=0.2)

# Number of missing values per column (rendered as the cell output).
df.isna().sum()

In [None]:
# Build one newline-separated "answers" string per question id.
# The result is a frame indexed by id with a single "answers" column.
func = "\n".join
df2 = df.groupby('id')["answers"].agg(answers=func)



In [None]:
# The raw data repeats the question row once per answer, so collapse those
# duplicates into a single record per question.
#
# This used to be written as `.agg("sum", "score")`. The extra positional
# "score" is not a column selector: it is forwarded to GroupBy.sum as its first
# argument (`numeric_only`), where a non-empty string merely counts as truthy.
# The effective behaviour is spelled out here instead: numeric columns are
# summed per question and non-numeric, non-key columns are left out.
question_keys = ['id', 'title', 'body', 'tags']
grouped = df.groupby(question_keys, as_index=False).sum(numeric_only=True)

# Attach the concatenated answers (df2 is indexed by id) to each question.
grouped_df = pd.merge(grouped, df2, on='id', how='left')
grouped_df



In [None]:
# --- Preprocessing: answers ---

# Tag removal. The helper's return value is discarded, so this step only has
# an effect if remove_tags modifies the Series it receives.
# NOTE(review): confirm against preprocessing.remove_tags.
answers = grouped_df["answers"]
preprocessing.remove_tags(answers)

# Text cleaning, applied to every concatenated answer string.
answers_processed = answers.apply(preprocessing.clear_text)


In [None]:
# Store the cleaned answers on the frame first, then finish the cell with the
# null count. A notebook only renders the value of a cell's last expression,
# so the head()/isna() calls that used to precede the assignment were
# evaluated and silently thrown away.
grouped_df['answers_processed'] = answers_processed

# How many answers came back as missing from the cleaning step.
answers_processed.isna().sum()

In [None]:
#Questions
#Only the question body is selected here (the title is NOT merged in); the title is cleaned into its own column further down.
questions = grouped_df["body"]
#The return value is discarded, so this step relies on remove_tags modifying the Series it receives - NOTE(review): confirm against preprocessing.remove_tags
preprocessing.remove_tags(questions)
questions

In [None]:
# Run the text-cleaning routine over every question body and keep the result
# as an additional column.
questions_processed = questions.apply(preprocessing.clear_text)
grouped_df['questions_processed'] = questions_processed

questions_processed

In [None]:
# Clean the question titles and store them in their own column.
processed_title = grouped_df["title"].apply(preprocessing.clear_text)
grouped_df['processed_title'] = processed_title

processed_title

In [None]:
# Remove the raw "answers" and "body" columns now that processed versions
# exist, and keep the `questions` Series (the question body after the
# remove_tags call) under the name "questions".
# A combined "post_corpus" column (title + body + answers, meant for training
# W2V embeddings) is currently disabled.
grouped_df.drop(columns=["answers", "body"], inplace=True)
grouped_df["questions"] = questions

grouped_df

In [None]:
# Turn the pipe-separated tag strings into lists and count how often each tag
# occurs, so that the most frequent tags can be selected afterwards.
#
# Two defects of the previous loop are fixed here:
#   * Tags containing "python" were removed from each list while that same
#     list was being iterated. Every removal shifted the remaining items, so
#     the tag following a removed one was skipped: it was neither filtered
#     (e.g. a second python-* tag survived) nor counted.
#   * A tag's first occurrence initialised its counter to 0, so every
#     frequency was one lower than the real number of occurrences.


def _non_python_tags(raw_tags):
    """Split a '|'-separated tag string and drop every tag containing "python"."""
    return [tag for tag in raw_tags.split('|') if "python" not in tag]


# Tags such as python, python-2.7 and python-3.x are dropped from every record.
grouped_df["tags"] = grouped_df["tags"].apply(_non_python_tags)

# Frequency of each remaining tag; keys are kept in first-seen order.
tag_freq_dict = {}
for tags in grouped_df["tags"]:
    for tag in tags:
        tag_freq_dict[tag] = tag_freq_dict.get(tag, 0) + 1

# Records with fewer than 3 tags are intentionally kept (the idea of dropping
# them for fastText training was never enabled).
grouped_df["tags"]


In [None]:
# Rank the tags by frequency and keep the 30 most frequent ones.
import heapq

top_tag_limit = 30
most_common_tags = heapq.nlargest(
    top_tag_limit,
    tag_freq_dict,
    key=lambda tag: tag_freq_dict[tag],
)
most_common_tags

In [None]:
# Keep only the records that carry at least one of the most common tags.
#
# The set of common tags is built once instead of once per row, and the
# selection is copied so that the column assignments in the following cells
# work on an independent frame rather than on a slice of grouped_df (which
# makes pandas emit SettingWithCopyWarning).
common_tag_set = set(most_common_tags)
has_common_tag = [
    not common_tag_set.isdisjoint(tags)
    for tags in grouped_df["tags"]
]
final_data = grouped_df.loc[has_common_tag].copy()

final_data

In [None]:
# Mean-normalise the scores: (score - mean) / (max - min).
score_mean = final_data['score'].mean()
score_range = final_data['score'].max() - final_data['score'].min()

if score_range == 0:
    # Every score is identical, so the formula would be 0/0 and yield NaN for
    # all rows; the dropna() applied further down would then discard the whole
    # data set. Map the scores to 0.0 instead.
    normalized_score = pd.Series(0.0, index=final_data.index)
else:
    normalized_score = (final_data['score'] - score_mean) / score_range

# assign() returns a new frame, which avoids writing into what may still be a
# slice of grouped_df.
final_data = final_data.assign(score=normalized_score)

In [None]:
# Serialise each tag list back into a single '|'-separated string.
final_data['tags'] = final_data['tags'].map('|'.join)

final_data

In [None]:
# Per-column null counts. This value is not rendered, because only the last
# expression of a cell is displayed.
final_data.isna().sum()

# Drop every row with a missing value in any column, then apply the explicit
# check on the processed title (the post_corpus check is disabled).
final_data = (
    final_data
    .dropna()
    .loc[lambda frame: frame['processed_title'].notna()]
)
final_data

In [None]:
# Build the TF-IDF matrix over title + question + answers and store it on disk.
#
# The three fields are joined with a space. Concatenating them directly glued
# the last token of one field to the first token of the next, producing
# artificial tokens in the TF-IDF vocabulary.
# NOTE(review): this assumes clear_text returns plain strings - confirm
# against preprocessing.clear_text.
field_separator = ' '
corpus = (
    final_data['processed_title']
    + field_separator
    + final_data['questions_processed']
    + field_separator
    + final_data['answers_processed']
)
matrix = tfidf.create_matrix(corpus)
scipy.sparse.save_npz('DB/tfidf_stack_matrix.npz', matrix)

In [24]:
# Keep only the rows in which all three processed text columns are present.
processed_text_columns = [
    'processed_title',
    'questions_processed',
    'answers_processed',
]
final_data = final_data.dropna(subset=processed_text_columns)

In [25]:
# Write the preprocessed data to disk; the row index is not written.
output_csv_path = 'DB/Preprocessed_data.csv'
final_data.to_csv(output_csv_path, index=False)