In [None]:
# Mount Google Drive at /content/drive/ so the data files referenced by the
# cells below can be read and written.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import csv
import json

from collections import defaultdict
from typing import List, Dict, Any

import pandas as pd

from bs4 import BeautifulSoup

In [None]:
def read_csv(path: str, row_names: List[str]) -> List[Dict[str, Any]]:
  """Read a header-less CSV file into a list of dicts.

  Args:
    path: Location of the CSV file.
    row_names: Field names assigned, by position, to the values of each row.

  Returns:
    One dict per CSV row mapping field name to the raw string value. Names and
    values are paired with zip, so surplus values (or surplus names) are
    dropped without an error.
  """
  # newline='' hands newline handling to the csv module, so line breaks
  # embedded in quoted fields are returned exactly as stored in the file.
  with open(path, newline='') as f:
    return [dict(zip(row_names, row)) for row in csv.reader(f)]

def merge_question_answers(questions: List[Dict[str, Any]], answers: List[Dict[str, Any]]):
  """Attach to every question the answers that reference it.

  Answers are grouped by their 'questionId' value and looked up with each
  question's 'id'. The output follows the order of `questions`; inside a
  group the answers keep the order they have in `answers`.

  Returns:
    A list of {'question': <question>, 'answers': [<answer>, ...]} dicts.
    A question that no answer refers to gets an empty list.
  """
  by_question = {}
  for item in answers:
    by_question.setdefault(item['questionId'], []).append(item)

  return [
      {'question': q, 'answers': by_question.setdefault(q['id'], [])}
      for q in questions
  ]

def save_json(data, path):
  """Serialise `data` as JSON into the file at `path`, replacing any existing content."""
  with open(path, 'w') as sink:
    json.dump(obj=data, fp=sink)

def answers_to_collection(answer_path, collection_path):
  """Write the answers CSV as a tab-separated (id, body) collection file.

  Args:
    answer_path: CSV of answers, read with the columns
      id, questionId, body, score.
    collection_path: Destination TSV; overwritten if it already exists.
  """
  answer_rows = ['id', 'questionId', 'body', 'score']
  # read_csv yields strings, so rows are ordered lexicographically by id
  # (then by body), not numerically.
  collection = sorted(
      [answer['id'], answer['body']]
      for answer in read_csv(answer_path, answer_rows))
  # newline='' is required for files handed to csv.writer; without it the
  # writer's own '\r\n' terminator can be translated a second time.
  with open(collection_path, 'w', newline='') as f:
    csv.writer(f, delimiter='\t').writerows(collection)

def answers_to_qrels(answer_path, qrels_path):
  """Write relevance judgements linking each question to its answers.

  For every answer two rows are emitted, one keyed by `<questionId>_t` and one
  by `<questionId>_q`, i.e. the same query ids that questions_to_queries uses
  for a question's title and body. Each row is
  [query id, 0, answer id, 1], tab-separated.
  NOTE(review): presumably the TREC qrels layout (query, iteration, document,
  relevance) — confirm with the consumer of this file.

  Args:
    answer_path: CSV of answers, read with the columns
      id, questionId, body, score.
    qrels_path: Destination TSV; overwritten if it already exists.
  """
  answer_rows = ['id', 'questionId', 'body', 'score']
  # Values are strings, so the ordering is lexicographic by answer id.
  pairs = sorted(
      [answer['id'], answer['questionId']]
      for answer in read_csv(answer_path, answer_rows))

  qrels = []
  for answer_id, question_id in pairs:
    qrels.append([question_id + '_t', 0, answer_id, 1])
    qrels.append([question_id + '_q', 0, answer_id, 1])

  # Write to the qrels_path argument. The previous version opened the global
  # `collection_path` here, which ignored the argument and overwrote the
  # collection file with qrels rows.
  with open(qrels_path, 'w', newline='') as f:
    csv.writer(f, delimiter='\t').writerows(qrels)

def questions_to_queries(question_path, query_path):
  """Write two tab-separated query rows per question: its title and its body.

  The title row is keyed `<id>_t` and the body row `<id>_q`; rows keep the
  order in which the questions appear in the input file.

  Args:
    question_path: CSV of questions, read with the columns
      id, title, tags, body, acceptedAnswerId, score, views.
    query_path: Destination TSV; overwritten if it already exists.
  """
  question_rows = ['id', 'title', 'tags', 'body', 'acceptedAnswerId', 'score', 'views']

  queries = []
  for question in read_csv(question_path, question_rows):
    queries.append([question['id'] + '_t', question['title']])
    queries.append([question['id'] + '_q', question['body']])

  # newline='' is required for files handed to csv.writer; without it the
  # writer's own '\r\n' terminator can be translated a second time.
  with open(query_path, 'w', newline='') as f:
    csv.writer(f, delimiter='\t').writerows(queries)

In [None]:
!ls drive

MyDrive  Shareddrives


In [None]:
# Build the (id, body) collection TSV from the large answers CSV.
# The paths are relative, so they resolve against the current working directory.
# NOTE(review): the `!ls drive` output in this notebook lists `Shareddrives`,
# while these paths use `Shared drives` — confirm the folder name on the mount.
answer_path = 'drive/Shared drives/685 Final Project/Stackoverflow Data/large_answers.csv'
collection_path = 'drive/Shared drives/685 Final Project/Stackoverflow Data/Training Data/collection.tsv'
answers_to_collection(answer_path, collection_path)

In [None]:
# Generate the qrels for the large answers CSV.
# `answer_path` is the global assigned in the collection cell, so that cell
# has to run first.
qrels_path = 'drive/Shared drives/685 Final Project/Stackoverflow Data/Training Data/qrels.tsv'
answers_to_qrels(answer_path, qrels_path)

In [None]:
# Turn the large questions CSV into the title/body query TSV.
question_path = 'drive/Shared drives/685 Final Project/Stackoverflow Data/large_questions.csv'
query_path = 'drive/Shared drives/685 Final Project/Stackoverflow Data/Training Data/queries.tsv'
questions_to_queries(question_path, query_path)

In [None]:
def _strip_html(text):
  """Return the text BeautifulSoup extracts from an HTML fragment via get_text()."""
  # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
  # installed; pin one (e.g. 'html.parser') once the expected output is agreed.
  return BeautifulSoup(text).get_text()

def creating_clean_questions(questions_path = 'drive/Shared drives/685 Final Project/Stackoverflow Data/questions_small.csv',answers_path = 'drive/Shared drives/685 Final Project/Stackoverflow Data/small_answers.csv'):
  """Load the question and answer CSVs and replace each HTML body with plain text.

  Both files are read without a header row; the column names are supplied
  here.

  Args:
    questions_path: CSV of questions with the columns
      id, title, tags, body, acceptedAnswerId, score, views.
    answers_path: CSV of answers with the columns
      id, questionId, title, body, score, views.
      NOTE(review): answers_to_collection / answers_to_qrels read answers with
      four columns (id, questionId, body, score) — confirm which schema the
      small answers file really has.

  Returns:
    A (questions, answers) tuple of DataFrames whose 'body' column holds the
    text extracted from the original HTML.
  """
  question_rows = ['id', 'title', 'tags', 'body', 'acceptedAnswerId', 'score', 'views']
  questions = pd.read_csv(questions_path, names = question_rows)
  # Series.map applies the function element-wise, exactly like the former
  # DataFrame.applymap call, without relying on the deprecated API.
  questions["body"] = questions["body"].map(_strip_html)

  answer_rows = ['id', 'questionId', 'title', 'body', 'score', 'views']
  answers = pd.read_csv(answers_path, names = answer_rows)
  answers["body"] = answers["body"].map(_strip_html)

  return questions, answers

In [None]:
# Pair every question with its answers and store the result as JSON.
# NOTE(review): `questions` and `answers` are not assigned in any cell visible
# before this one, so this cell presumably depends on leftover kernel state —
# confirm and load them explicitly (merge_question_answers indexes each item
# with 'id' / 'questionId', i.e. it expects dict-like rows).
merged_data = merge_question_answers(questions, answers)
save_json(merged_data, 'drive/Shared drives/685 Final Project/Stackoverflow Data/small_merged.json')

In [None]:
from sklearn.model_selection import train_test_split

def train_valid_test_por(questions,test_por = 0.2,valid_por = 0.1,state = 2):
  """Split `questions` into train / validation / test parts by proportion.

  Args:
    questions: The data to split; passed straight to train_test_split.
    test_por: Fraction of the full data set reserved for testing.
    valid_por: Fraction of the full data set reserved for validation.
    state: random_state used for both splits.

  Returns:
    A (train, validation, test) tuple.
  """
  remainder, test_part = train_test_split(
      questions, random_state = state, test_size = test_por)

  # valid_por is expressed against the full data set, but the second split
  # only sees what is left after removing the test part, so rescale it.
  valid_share = valid_por/(1-test_por)
  train_part, valid_part = train_test_split(
      remainder, random_state = state, test_size = valid_share)

  return train_part, valid_part, test_part

def train_valid_test_fix(questions,test_val = 50000,valid_val = 50000,state = 2):
  """Split `questions` into train / validation / test parts of fixed size.

  Args:
    questions: The data to split (DataFrame or any sequence with len()).
    test_val: Number of rows to put in the test part.
    valid_val: Number of rows to put in the validation part.
    state: random_state used for both splits.

  Returns:
    A (train, validation, test) tuple.

  Raises:
    ValueError: if test_val + valid_val would leave no rows for training.
  """
  test_val = int(test_val)
  valid_val = int(valid_val)

  # len() counts rows. The previous `questions.size` is rows * columns for a
  # DataFrame, which shrank both splits by a factor of the column count.
  n_rows = len(questions)
  if test_val + valid_val >= n_rows:
    raise ValueError(
        'test_val + valid_val (%d) must be smaller than the number of rows (%d)'
        % (test_val + valid_val, n_rows))

  # An integer test_size is taken by train_test_split as an absolute number
  # of samples, which avoids rounding drift from converting to fractions.
  questions_train_val, questions_test = train_test_split(
      questions, test_size = test_val, random_state = state)
  questions_train, questions_valid = train_test_split(
      questions_train_val, test_size = valid_val, random_state = state)
  return questions_train, questions_valid, questions_test

def create_qfiles(questions,path):
  """Split `questions` with train_valid_test_fix and write each part as CSV.

  The files questions_train_sid.csv, questions_valid_sid.csv and
  questions_test_sid.csv are written, in that order, under `path`.
  """
  suffixes = (
      '/questions_train_sid.csv',
      '/questions_valid_sid.csv',
      '/questions_test_sid.csv',
  )
  # train_valid_test_fix returns (train, valid, test), matching the suffix order.
  for part, suffix in zip(train_valid_test_fix(questions), suffixes):
    part.to_csv(path + suffix)


# Load the cleaned question/answer frames from the default (small) data paths
# and write the train/valid/test question splits into the data folder.
# `ans` is not used by the remaining statements of this cell.
ques,ans = creating_clean_questions()
test_path = 'drive/Shared drives/685 Final Project/Stackoverflow Data'
create_qfiles(ques,test_path)

# help = {}
# answers_test = pd.DataFrame().reindex_like(answers)
# for q in questions_test:
#   help[q["questionID"]] = 1

# for p in answers:
#   if help.has_key[p["questionID"]]:
#     answers_test.append(p,ignore_index = True)