In [2]:
import numpy as np
import json
import pandas as pd
import re
import math
import tensorflow as tf

from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.models import Sequential

from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer


In [3]:
with open('Data/data.json', 'r', encoding="utf-8") as handle:
    dataf = json.load(handle)

testConversation = list(dataf[3].keys())[0]
print(testConversation)

W: So, I've had a look at your résumé, and you seem to have the relevant work experience. I've also checked your references, and your former employers all have nothing but good things to say about you. ---M: That's good to hear. I've been very fortunate to have worked with some great companies, under strong and inspiring leadership. ---W: Well, I just have a couple of questions for you. Where do you see yourself five years from now? ---M: Well, at my age, I'm really looking for a job that can offer me some security. Most importantly, I want a job that can give me the opportunity to build a successful career. (12) (13) I don't want to put myself in a position where I am unable to reach my full potential. (13)---W: I understand. You're open to movement within the company? ---M: Exactly. In fact, the reason I left my last two jobs was because there was no room for promotion. (13)---W: You seem very determined. We like that. I'm certain you will be given every opportunity when the appropri

In [4]:
np.random.seed(400)  # We fix the random seed to ensure we get consistent results when we repeat the lab.

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in simple_preprocess(text) :  # Tokenize, remove very short and very long words, convert to lower case, remove words containing non-letter characters
        if token not in STOPWORDS:
            result.append(WordNetLemmatizer().lemmatize(token, 'v'))
            
    return result

processed_convs = []
for data in tqdm(dataf):
    conv = list(data.keys())[0]
    conv = re.sub(r'\(*\d*\)', r'',conv)
    processed_convs.append(preprocess(conv))

convs_dictionary = Dictionary(processed_convs) # construct word<->id mappings - it does it in alphabetical order
print(convs_dictionary)

bow_corpus = [convs_dictionary.doc2bow(doc) for doc in processed_convs]
# print(bow_corpus[3])

# Define a LDA model
num_topics = 5
lda_model =  LdaModel(bow_corpus, 
                      num_topics=num_topics, 
                      id2word=convs_dictionary,                        
                    )

doc_topics = lda_model.__getitem__(bow_corpus)
print(len(doc_topics))

100%|██████████████████████████████████████████████████████████████████████████| 12447/12447 [00:06<00:00, 1972.67it/s]


Dictionary<11370 unique tokens: ['appointment', 'clock', 'day', 'douglas', 'go']...>
12447


In [5]:
doc_topics = lda_model.__getitem__(bow_corpus)
print(doc_topics[2])

topics_distribution = []
for one_doc in doc_topics:
    temp = []
    for i in range(len(one_doc)):
        temp.append(one_doc[i][1])
    
    max_probs_index = np.argsort(temp)[-1]
    max_topics = one_doc[max_probs_index][0]
    topics_distribution.append(max_topics)
    
print(topics_distribution)

[(4, 0.9777255)]
[4, 4, 4, 2, 0, 4, 3, 0, 3, 3, 1, 0, 0, 4, 4, 4, 4, 4, 0, 4, 0, 4, 3, 1, 0, 0, 1, 0, 3, 4, 4, 1, 4, 2, 2, 4, 4, 4, 3, 0, 3, 0, 4, 4, 4, 2, 4, 0, 0, 0, 4, 1, 2, 0, 1, 4, 1, 4, 4, 0, 2, 1, 4, 3, 4, 0, 4, 4, 2, 0, 4, 4, 3, 4, 2, 4, 1, 0, 0, 4, 0, 0, 0, 0, 0, 4, 0, 0, 4, 2, 0, 3, 4, 4, 1, 0, 4, 4, 3, 4, 4, 4, 4, 0, 0, 4, 0, 2, 0, 0, 3, 4, 3, 4, 2, 2, 4, 4, 0, 4, 4, 0, 4, 0, 4, 4, 4, 4, 0, 0, 0, 2, 4, 1, 4, 4, 4, 4, 0, 0, 0, 1, 4, 4, 2, 3, 4, 4, 4, 4, 0, 4, 0, 3, 1, 4, 3, 3, 0, 2, 3, 1, 4, 0, 1, 3, 0, 4, 0, 3, 4, 1, 3, 3, 3, 2, 1, 4, 4, 3, 0, 1, 0, 1, 4, 4, 2, 2, 0, 1, 4, 2, 4, 3, 3, 0, 4, 0, 3, 0, 0, 2, 4, 4, 1, 0, 1, 0, 4, 3, 4, 4, 4, 3, 2, 0, 0, 0, 1, 1, 3, 4, 0, 0, 0, 4, 4, 2, 0, 4, 1, 2, 4, 2, 0, 2, 0, 2, 3, 4, 0, 4, 0, 3, 0, 2, 3, 0, 0, 0, 4, 1, 4, 0, 0, 1, 0, 4, 4, 1, 3, 0, 4, 0, 4, 4, 4, 4, 2, 0, 0, 1, 4, 4, 1, 0, 4, 3, 4, 2, 0, 4, 4, 3, 4, 0, 2, 4, 0, 4, 4, 0, 4, 4, 4, 4, 4, 2, 0, 4, 2, 4, 0, 0, 4, 0, 0, 3, 3, 0, 3, 2, 4, 4, 3, 0, 0, 4, 4, 4, 4, 0, 4, 4, 4, 0, 2, 4

In [6]:
# material_ques_data = 
# material_ques_data = pd.read_csv("Data/Diffculty Level/Materials_Difficulty_index.csv")
material_ques_data = pd.read_csv("Data/Diffculty Level/Mateials_diffculty_by_cluster.csv")

# material_ques_data = material_ques_data.rename(columns={"Unnamed: 0":"Material Index"})

# display(material_ques_data)

question_num = []
for data in tqdm(dataf):
    ques = list(data.values())[0]
    question_num.append(len(ques))

material_ques_data["Questions Numbers"] = question_num
material_ques_data["Topics"] = topics_distribution
display(material_ques_data)

100%|███████████████████████████████████████████████████████████████████████| 12447/12447 [00:00<00:00, 1234663.27it/s]


Unnamed: 0,Materials Difficulty,Questions Numbers,Topics
0,0.01,1,4
1,-0.99,1,4
2,1.01,2,4
3,2.01,3,2
4,2.02,3,0
...,...,...,...
12442,-2.77,1,4
12443,1.14,2,0
12444,-1.56,1,0
12445,-2.76,1,0


In [7]:
# material_ques_data.to_csv("Data/Material Features For ML/Material_Features_For_ML.csv, index=False")
material_ques_data.to_csv("Data/Material Features For ML/Material_Features_For_ML.csv")

In [9]:
m = pd.read_csv("Data/Material Features For ML/Material_Features_For_ML.csv")
m1 = m.rename(columns={"Unnamed: 0":"Material Index"})
display(m1)
m1.to_csv("Data/Material Features For ML/Material_Features_For_ML.csv")

Unnamed: 0,Material Index,Materials Difficulty,Questions Numbers,Topics
0,0,0.01,1,4
1,1,-0.99,1,4
2,2,1.01,2,4
3,3,2.01,3,2
4,4,2.02,3,0
...,...,...,...,...
12442,12442,-2.77,1,4
12443,12443,1.14,2,0
12444,12444,-1.56,1,0
12445,12445,-2.76,1,0


In [23]:
def loadMaterialsFeaturesForML(path):
    MaterialsFeaturesForML = pd.read_csv(path)
    return MaterialsFeaturesForML


def getQuestionsDifficulty(dataf, materialDifficultyLevelNor):
    questionsDifficultyLevelNor = []
    for i, data in tqdm(enumerate(dataf)):
        ques = list(data.values())[0]   
        questionsDifficultyLevelNor.append([materialDifficultyLevelNor[i]]*len(ques))
    
    return questionsDifficultyLevelNor

def getQuestionsScore(questionsDifficultyLevelNor):
    question_score = []
    material_score = []
    question_score_int = []
    material_score_int = []    
    for quesdif in questionsDifficultyLevelNor:
        temp = []
        for onequesdif in quesdif:
            temp.append(onequesdif)
        material_score.append(sum(temp))
        question_score.append(temp)
    
    min_scores = (min(material_score) * (-1))+1
    for i, scores in  enumerate(question_score):
        material_score_int.append(material_score[i] + min_scores)
        temp_int = []
        for onescore in scores:
            temp_int.append(onescore + min_scores)
        question_score_int.append(temp_int)
        
    return question_score_int, material_score_int

def getProbOfQuestions(ability, CandidateMaterialsDifficultyLevel):
    basic_prob = 1/len(CandidateMaterialsDifficultyLevel)
    question_center_1_range = int(len(CandidateMaterialsDifficultyLevel)/40)
    question_center_2_range = int(len(CandidateMaterialsDifficultyLevel)/20)
    question_center_3_range = int(len(CandidateMaterialsDifficultyLevel)/10)
    

    diff_ability_close_level = []
    for quesdif in CandidateMaterialsDifficultyLevel:
        diff_ability_close_level.append(abs(quesdif-ability))
    
    diff_ability_close_level_array = np.array(diff_ability_close_level)
    diff_ability_close_level_array_sorted = np.argsort(diff_ability_close_level_array)
    
    _1_probs = 4*(1/len(CandidateMaterialsDifficultyLevel))
    _2_probs = 3*(1/len(CandidateMaterialsDifficultyLevel))
    _3_probs = 2*(1/len(CandidateMaterialsDifficultyLevel))
    
    priority_probs = question_center_1_range*_1_probs + question_center_2_range*_2_probs + question_center_3_range*_3_probs
    edge_probs = (1-priority_probs)/(len(CandidateMaterialsDifficultyLevel) - question_center_1_range - question_center_2_range - question_center_3_range)
        
    probs_distribution = np.zeros((len(CandidateMaterialsDifficultyLevel)))
    
    for i in diff_ability_close_level_array_sorted[:question_center_1_range]:
        probs_distribution[i] = _1_probs 
    
    for i in diff_ability_close_level_array_sorted[question_center_1_range:question_center_2_range]:
        probs_distribution[i] = _2_probs
        
    for i in diff_ability_close_level_array_sorted[question_center_2_range:question_center_3_range]:
        probs_distribution[i] = _3_probs
    
    for i in diff_ability_close_level_array_sorted[question_center_3_range:]:
        probs_distribution[i] = edge_probs
    
    return probs_distribution

# 将已知的数据进行的训练，然后将所有的推荐候选的题目(特征)输入进逻辑回归模型，模型将会进行二分类，最终模型将会返回每个题目的y=1/0的概率，
# 获取所有的题目的y=1的概率，然后结合分数（Si）以及题目概率(P(i))，计算期望。

# 题目特征，初步定义为：1. 一个听力材料的试题数量。 2. 听力材料的主题类别。 3.听力试题的难度。
def Logistic_Regression(xtrain, ytrain, xtest):
    ytrain_0_1 = []
    if len(set(ytrain)) == 1:
        lr = LogisticRegression()
        print(1)
    else:
        lr = LogisticRegression(multi_class = 'multinomial', solver = 'newton-cg')
    
    for y in ytrain:
        if y > 0.5:
            ytrain_0_1.append(1)
        else:
            ytrain_0_1.append(0)
    lr_model = lr.fit(xtrain, ytrain_0_1)
    y_predict_prob = lr_model.predict_proba(xtest)
    y_predict_label = lr_model.predict(xtest)
    return lr_model, y_predict_prob

def NN(xtrain, ytrain, xtest, lr, decay):
    
    input_shape = xtrain.shape[1]
    model = Sequential([
        Dense(units=32, activation='relu', input_shape=input_shape)
        Dense(units=32, activation='relu')
        Dense(units=2, activation='softmax')
    ])
    
    adam = Adam(learning_rate = lr, decay = decay)

    model.compile(optimizer=adam, loss='BinaryCrossentropy')
    
    model.fit(x_train, ytrain)
    
    result_probs = model.predict(xtest)
    return result_probs
    
def Max_N_Expectation(scores, questions_probs, posterior_probs, n_expectatios):
    expectation_x_z = []
    posterior_y1_probs = posterior_probs[:,1]
    for i, _ in enumerate(scores):
        E = scores[i] * questions_probs[i] * posterior_y1_probs[i]
        expectation_x_z.append(E)
    
    expectation_x_z = np.argsort(expectation_x_z)
#     print(expectation_x_z)
    top_n_ques_index = expectation_x_z[(len(expectation_x_z) - n_expectatios):]
    return top_n_ques_index

In [9]:
path = "Data/Material Features For ML/Material_Features_For_ML.csv"
Material_features = loadMaterialsFeaturesForML(path)
display(Material_features)

Unnamed: 0,Material Index,Materials Difficulty,Questions Numbers,Topics
0,0,-1.000000,1,4
1,1,-1.000000,1,4
2,2,-0.963304,2,4
3,3,-0.203447,3,2
4,4,-0.162517,3,0
...,...,...,...,...
12442,12442,-1.000000,1,4
12443,12443,-0.963304,2,0
12444,12444,-1.000000,1,0
12445,12445,-1.000000,1,0


In [10]:
matDiff = pd.read_csv("Data/Diffculty Level/Materials_Difficulty.csv")
display(list(matDiff['Materials Difficulty']))
quesDiff = getQuestionsDifficulty(dataf, list(matDiff['Materials Difficulty']))
print(quesDiff)

[-1.0,
 -1.0,
 -0.9633037399721855,
 -0.2034472776617499,
 -0.1625168743434533,
 0.1563355154986641,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -0.1432166031398588,
 0.0359611833401274,
 -0.2497039306405754,
 -1.0,
 -1.0,
 -1.0,
 -0.9266074799443712,
 -0.9633037399721855,
 -0.9633037399721855,
 -1.0,
 -0.2614371333321859,
 -0.5335693922680903,
 -1.0,
 -1.0,
 -0.0285957418405922,
 -0.5048395636496057,
 -0.5014151254952836,
 -0.4251190592231699,
 -0.2171326442483268,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 0.0922598106130667,
 0.1460567850855831,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -0.4481460689413424,
 -1.0,
 -0.9633037399721855,
 -0.4918969361484669,
 -1.0,
 -1.0,
 -1.0,
 -0.3558746339099167,
 -0.4394378596310201,
 -1.0,
 -1.0,
 -0.4822574152395207,
 -1.0,
 -1.0,
 -1.0,
 -0.263707290187662,
 0.1926510020893297,
 -0.1562620722888796,
 -0.4977278043535814,
 -1.0,
 -1.0,
 -1.0,
 -0.3462931753127328,
 -0.0724357455101906,
 -1.0,
 0.0214488505506709,
 -1.0,
 -1.0,
 0.0272741704708

12447it [00:00, 825960.76it/s]

[[-1.0], [-1.0], [-0.9633037399721855, -0.9633037399721855], [-0.2034472776617499, -0.2034472776617499, -0.2034472776617499], [-0.1625168743434533, -0.1625168743434533, -0.1625168743434533], [0.1563355154986641, 0.1563355154986641, 0.1563355154986641, 0.1563355154986641], [-1.0], [-1.0], [-1.0], [-1.0], [-0.1432166031398588, -0.1432166031398588, -0.1432166031398588], [0.0359611833401274, 0.0359611833401274, 0.0359611833401274, 0.0359611833401274], [-0.2497039306405754, -0.2497039306405754, -0.2497039306405754], [-1.0], [-1.0], [-1.0], [-0.9266074799443712, -0.9266074799443712, -0.9266074799443712], [-0.9633037399721855, -0.9633037399721855], [-0.9633037399721855, -0.9633037399721855], [-1.0], [-0.2614371333321859, -0.2614371333321859, -0.2614371333321859], [-0.5335693922680903, -0.5335693922680903], [-1.0], [-1.0], [-0.0285957418405922, -0.0285957418405922, -0.0285957418405922], [-0.5048395636496057, -0.5048395636496057], [-0.5014151254952836, -0.5014151254952836], [-0.4251190592231699




In [11]:
question_score, material_score = getQuestionsScore(quesDiff)
print(material_score)

[4.844376541614804, 4.844376541614804, 3.917769061670433, 5.2340347086295544, 5.356825918584445, 6.4697186036094605, 4.844376541614804, 4.844376541614804, 4.844376541614804, 4.844376541614804, 5.414726732195228, 5.988221274975314, 5.095264749693078, 4.844376541614804, 4.844376541614804, 4.844376541614804, 3.0645541017816904, 3.917769061670433, 3.917769061670433, 4.844376541614804, 5.060065141618247, 4.777237757078623, 4.844376541614804, 4.844376541614804, 5.758589316093028, 4.834697414315593, 4.841546290624237, 4.569019363945294, 5.192978608869824, 4.844376541614804, 4.844376541614804, 4.844376541614804, 4.844376541614804, 4.844376541614804, 4.844376541614804, 4.844376541614804, 4.844376541614804, 6.213415784067071, 6.428603681957137, 4.844376541614804, 4.844376541614804, 4.844376541614804, 4.844376541614804, 4.844376541614804, 4.844376541614804, 4.948084403732119, 4.844376541614804, 3.917769061670433, 4.86058266931787, 4.844376541614804, 4.844376541614804, 4.844376541614804, 4.7767526

In [12]:
probs_distribution = getProbOfQuestions(ability=0, CandidateMaterialsDifficultyLevel=list(matDiff['Materials Difficulty']))
print(probs_distribution)

[5.35786829e-05 5.35786829e-05 5.35786829e-05 ... 5.35786829e-05
 5.35786829e-05 5.35786829e-05]


In [24]:
Xtrain = material_ques_data[0:5]
Ytrain = [1, 1, 1, 0, 1]
Xtest = material_ques_data[5:]

lr_model, predict_probs = Logistic_Regression(Xtrain, Ytrain, Xtest)
print(predict_probs)

1


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [34]:
top_n_questions = Max_N_Expectation(scores=material_score[5:], questions_probs=probs_distribution[5:], 
                                    posterior_probs=predict_probs, n_expectatios=5)

In [35]:
print(top_n_questions)

[56 33 32  6  0]
