In [1]:
import glob
import pickle
import pandas as pd
import numpy as np
import scipy
import nltk
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import random
from sklearn.decomposition import LatentDirichletAllocation 
from gensim import corpora, models, similarities
import jieba
import re

## Formatting the input data

In [324]:
# Posts & post titles from the pickled DataFrames --> lists of strings.
# Each string in a list is the full text of one post (or its title); it is tokenized later.

# glob returns paths in arbitrary, filesystem-dependent order. Sort them so the
# positional ids printed later (e.g. "text1744") are stable across runs and machines.
dfs = sorted(glob.glob('data/dataframes/*.p'))
ans_dfs = sorted(glob.glob('data/answers/*.p'))

# Fail early with a readable message instead of pd.concat's
# "No objects to concatenate" when the data directories are missing or empty.
if not dfs:
    raise FileNotFoundError("No pickled post DataFrames found under data/dataframes/*.p")
if not ans_dfs:
    raise FileNotFoundError("No pickled answer DataFrames found under data/answers/*.p")

# SECURITY: pd.read_pickle unpickles arbitrary objects and can execute code.
# Only load pickle files that were produced by a trusted source.
frames = [pd.read_pickle(path) for path in dfs]
ans_frames = [pd.read_pickle(path) for path in ans_dfs]

df = pd.concat(frames)
# Explicit comparison on purpose: rows where is_student is missing compare as
# False and are dropped, rather than being used as a (possibly invalid) mask value.
df = df[df.is_student == True]
dataList = df["text"].tolist()        # post bodies
subjectList = df["subject"].tolist()  # post titles, same order as dataList

ans = pd.concat(ans_frames)
ans.head()

#Randomizing order if later want to split into training and test inputs
# z = list(zip(dataList, subjectList))
# random.shuffle(z)
# dataList[:], subjectList[:] = zip(*z)

Unnamed: 0,id,created,type,folders,tags,is_announcement,history,children,tag_good,is_student,no_answer,num_children,num_favorites,num_revisions,unique_views,subject,text,word_count,answer
0,jau1tihlbb82ou,2017-12-05 19:59:43+00:00,note,[general],"[general, instructor-note, pin]",1,"[{'anon': 'no', 'uid': 'hbpawagy1su7ok', 'subj...","[{'anon': 'stud', 'folders': [], 'data': {'emb...",[],False,,1,1,1,244,Last Call For Style Regrade Requests,Any code style regrade requests should be in ...,210,[]
1,jatlo9mj8eh16r,2017-12-05 12:27:44+00:00,note,[exams],"[exams, instructor-note, pin]",1,"[{'anon': 'no', 'uid': 'hbpawagy1su7ok', 'subj...",[],[],False,,0,0,2,268,Final Exam Logistics,The final exam for our class will be at 8 AM ...,1073,[]
2,ja2v62y52r47iw,2017-11-16 19:23:45+00:00,note,[general],"[general, instructor-note, pin]",1,"[{'anon': 'no', 'uid': 'hbpawagy1su7ok', 'subj...",[],[],False,,0,0,1,258,Material for recitation tomorrow,Below are links to the materials you&#39;ll n...,167,[]
3,ja2josndfk6z7,2017-11-16 14:02:23+00:00,note,[general],"[general, instructor-note, pin]",1,"[{'anon': 'no', 'uid': 'hbpawagy1su7ok', 'subj...",[],[],False,,0,2,1,257,Grading issue punchlist form,If you have any sort of grade / course issue ...,361,[]
4,j9urk22tlgx5jy,2017-11-11 03:20:29+00:00,note,[assignment7],"[assignment7, instructor-note, pin]",1,"[{'anon': 'no', 'uid': 'hbpawagy1su7ok', 'subj...","[{'anon': 'stud', 'folders': [], 'data': {'emb...","[{'role': 'student', 'name': 'Justin Do', 'adm...",False,,1,0,2,260,Autograder machine having problems,Something seems to be wrong with the machine....,225,[]


## Constructing the corpus

In [321]:
#source: https://medium.com/better-programming/introduction-to-gensim-calculating-text-similarity-9e8b55de342d

def _build_tfidf(documents):
    """Tokenize raw strings and fit a gensim TF-IDF model on them.

    Parameters
    ----------
    documents : list of str
        One raw string per document (post body or post title).

    Returns
    -------
    tokens : list of list of str
        jieba tokenization of each document.
    vocab : gensim.corpora.Dictionary
        Mapping from each distinct token to an integer id.
    bow : list of list of (int, int)
        Bag-of-words form of each document: (token id, count) pairs.
    model : gensim.models.TfidfModel
        TF-IDF model fitted on ``bow``.
    """
    # NOTE(review): jieba is a Chinese word segmenter; on English text it
    # presumably also emits whitespace/punctuation tokens -- confirm this is intended.
    tokens = [jieba.lcut(doc) for doc in documents]
    vocab = corpora.Dictionary(tokens)
    bow = [vocab.doc2bow(doc_tokens) for doc_tokens in tokens]
    return tokens, vocab, bow, models.TfidfModel(bow)


# Post bodies.
texts, dictionary, corpus, tfidf = _build_tfidf(dataList)
# Vocabulary size (number of distinct tokens), i.e. the width of the sparse
# vectors; needed as num_features when building the similarity index.
feature_cnt = len(dictionary.token2id)

# Same pipeline for post titles, with its own independent vocabulary.
subjects, subDict, subCorpus, subTfidf = _build_tfidf(subjectList)
sub_fc = len(subDict.token2id)

## Running the comparison

In [323]:
### If you want to input in the command line
print("Title:")
subKw = input()
print("Post:")
keyword = input()
#####

### Sample inputs for demo
# keyword = "I used the instructions on Sakai for adding the junit tests but I'm still getting an error. Does anyone know how to fix this?"
# subKw = "Error adding Junit tests"
# keyword = "Is there going to be an curve on the midterm or are our gradescope scores the final grade?"
# subKw = "Exam grades"
# keyword = "Will the questions on the final be similar to the types of questions on the midterm?"
# subKw = "Final exam format"
# keyword = "For ShortButFairDispatcher, I'm not sure what it's asking. How are we supposed to determine which driver is the closest if the closest driver was in the last 5?"
# subKw = "ShortestButFairDispatcher"
#####

# Weight given to title similarity in each of the three ranking formulas
# (body similarity always has weight 1).
TITLE_WEIGHT_BALANCED = 0.5
TITLE_WEIGHT_SUB_BIAS = 1
TITLE_WEIGHT_POST_BIAS = 0.1


def _best_match(body_scores, title_scores, title_weight):
    """Return (index, score) of the post maximizing body + title_weight * title.

    Ties resolve to the lowest index (np.argmax returns the first maximum).
    The search starts from scratch on every call, so the result never depends
    on values left over in the kernel from a previous query.
    """
    combined = np.asarray(body_scores) + np.asarray(title_scores) * title_weight
    best = int(np.argmax(combined))
    return best, float(combined[best])


#source: https://medium.com/better-programming/introduction-to-gensim-calculating-text-similarity-9e8b55de342d
# Query text -> sparse bag-of-words vector over the post-body vocabulary.
kw_vector = dictionary.doc2bow(jieba.lcut(keyword))
# Similarity index over the TF-IDF-weighted post bodies.
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features = feature_cnt)
# Similarity score of the query against every post body, in dataList order.
sim = index[tfidf[kw_vector]]

#repeats above block for post titles
subKw_vector = subDict.doc2bow(jieba.lcut(subKw))
subIndex = similarities.SparseMatrixSimilarity(subTfidf[subCorpus], num_features = sub_fc)
subSim = subIndex[subTfidf[subKw_vector]]

# Best post under each of the 3 formulas. Previously these running maxima were
# compared before ever being assigned (NameError on a fresh kernel, stale
# results from the previous query on a reused kernel).
balanced_id, balanced = _best_match(sim, subSim, TITLE_WEIGHT_BALANCED)
sub_bias_id, sub_bias = _best_match(sim, subSim, TITLE_WEIGHT_SUB_BIAS)
post_bias_id, post_bias = _best_match(sim, subSim, TITLE_WEIGHT_POST_BIAS)

#prints out similar posts from each of 3 formulas
print("These posts might be similar to what you're looking for: \n")

print('keyword is similar to text%d: %.2f' % (balanced_id, balanced))
print(dataList[balanced_id])
print("\n")


print('keyword is similar to text%d: %.2f' % (sub_bias_id, sub_bias))
print(dataList[sub_bias_id])
print("\n")


print('keyword is similar to text%d: %.2f' % (post_bias_id, post_bias))
print(dataList[post_bias_id])
print("\n")

Title:
Error adding JUnit tests
Post:
I used the instructions on Sakai for adding the junit tests but I'm still getting an error. Does anyone know how to fix this?
These posts might be similar to what you're looking for: 

keyword is similar to text1744: 0.37
 Can someone explain how to put the Junit tests into eclipse? I&#39;m having trouble getting them to run and i don&#39;t know if I&#39;m doing something completely wrong - I made sure to add the junit test to the classpath and made sure the a6 project is linked to it as well. any help would be appreciated! also big thank you to the people who shared their junit test code on piazza! 


keyword is similar to text1968: 0.62
 I followed the a3 grader steps to add the Junit tests to work for a4, and a4 is in the class path, but the line that imports a4 is giving me an error. How do I fix this?      Edit: This is the error message that pops up. I have tried hovering over and manually clicking &#34;fix project setup&#34; as well.       S