# Dataset 

This notebook creates a dataset used for all our experiments. Using a common dataset helps in robust evaluation of our learning algorithm, with respect to the omniscient policy. 

We create a dataset & save it for use by the learning algorithm. 

In [None]:
'''
This class holds student data. It is made to hold all attributes of the students. StudentContext is meant to take a subset
of attributes from this class.
'''
class Students:
    """Holder for student data.

    The data is created during data generation and attached to the object
    through ``setStudentsFeatures``.
    """

    def setStudentsFeatures(self, student_data):
        """Store ``student_data`` (created during data generation)."""
        self.studentsFeatures = student_data

    def getStudentsFeatures(self):
        """Return whatever was last stored by ``setStudentsFeatures``."""
        return self.studentsFeatures
    
'''
This class holds content data. It is made to hold all attributes of contents & topics. ContentContext takes a subset of
attributes of content & topics.
'''
class Content:
    """Holder for content features and the topic data of a course."""

    def setContentData(self, courseContent):
        """Store ``courseContent`` (created during data generation)."""
        self.contentsFeatures = courseContent

    # TODO: rename to content data
    def getContentData(self):
        """Return whatever was last stored by ``setContentData``."""
        return self.contentsFeatures

    def setTopics(self, topics):
        """Store ``topics`` (created during data generation)."""
        self.topicContent = topics

    def getTopics(self):
        """Return whatever was last stored by ``setTopics``."""
        return self.topicContent

'''
Class that encapsulates the student & content data generators. It uses StudentDataGen & ContentDataGen to create data.
'''
class DataGenerator:
    """Facade that drives the student and content data generators.

    Call a ``create*`` method first; the matching ``get*`` method then
    returns the data cached by that call.
    """

    def __init__(self):
        self.studentDataGen = StudentDataGen()
        self.contentDataGenerator = ContentDataGen()

    # ---- student data -------------------------------------------------

    def createStudentData(self):
        """Generate student data and cache it on this object."""
        generated = self.studentDataGen.create()
        self.studentData = generated

    def getStudentData(self):
        """Return the student data cached by ``createStudentData``."""
        return self.studentData

    # ---- content / topic data -----------------------------------------

    def createContentData(self):
        """Generate content features and topic data and cache both."""
        content_gen = self.contentDataGenerator
        # Keep this order: getContentsFeatures() (re)builds the topic
        # mapping that getTopicContent() hands back.
        self.contentsFeatures = content_gen.getContentsFeatures()
        self.topicContent = content_gen.getTopicContent()

    def getContentData(self):
        """Return the content features cached by ``createContentData``."""
        return self.contentsFeatures

    def getTopicData(self):
        """Return the topic data cached by ``createContentData``."""
        return self.topicContent

'''
This is the student data generator
'''
class StudentDataGen:
    """Generate a random table of binary student preferences.

    Each row is one student and each column is one entry of
    ``student_context``; a cell is drawn from a Bernoulli distribution with
    the matching probability in ``prob_student_context``.

    Args:
        number_of_students: Number of students (rows) to generate.
            Defaults to 2.
    """

    def __init__(self, number_of_students=2):
        self.number_of_students = number_of_students  # Students taking the course.
        self.student_context = ['video', 'audio', 'reading', 'kinesthetic']  # Student preferences
        # Probability that a student has each preference, aligned by position
        # with student_context (mirrors ContentDataGen.prob_content_context).
        self.prob_student_context = [0.7, 0.6, 0.5, 0.4]
        # TO-DO: Students' preference for learning via various ways could be rated on a
        #   scale of 10 rather than being binary:
        #   visual, text, demo-based, show & do, step-by-step, activity/task based, lecture, audio.
        # Other candidate features:
        #   - self-evaluated readiness/motivation/excitement for the course (scale of 10)
        #   - pre-assessment of the pre-requisites required for the course (scale of 10)
        #   - gender: would be a binary value
        #   - age: any number between 0 - 100, normalized by dividing by 100
        #   - features that can be captured in a live system: response times,
        #     correctness of answer, interactions, forgetfulness
        # TO-DO: Have student preferences & probability of having those preferences as a tuple.

    def create(self):
        """Return a DataFrame of 0/1 preferences.

        Returns:
            pandas.DataFrame with ``number_of_students`` rows and one column
            per entry of ``student_context``.

        Raises:
            ValueError: If ``prob_student_context`` and ``student_context``
                have different lengths.
        """
        n_features = len(self.student_context)
        if len(self.prob_student_context) != n_features:
            raise ValueError(
                "prob_student_context must have one probability per student_context entry"
            )
        # One Bernoulli draw per (student, preference); the probability list
        # is broadcast across the rows.
        samples = np.random.binomial(
            1,
            self.prob_student_context,
            size=(self.number_of_students, n_features),
        )
        return pd.DataFrame(data=samples, columns=self.student_context)
    
'''
This is the content data generator
'''
class ContentDataGen:
    """Generate topic/content identifiers and a random content feature table.

    Every topic ``T_<i>`` gets a random number of contents (2 to 4, drawn
    once at construction) named ``C_<i>_<k>``. Each content then receives a
    binary value per entry of ``content_context``.

    Args:
        number_of_topics: Number of topics in the course. Defaults to 5.
    """

    def __init__(self, number_of_topics=5):
        self.number_of_topics = number_of_topics  # Number of topics in the course
        # Candidate content features:
        #   1. ease of understanding  2. surface level / simple / for intuition
        #   3. how brief/concise it is  4. how thoroughly it covers the topic
        #   5. how well rated it is  6. in-depth / thorough / technical  7. theoretic
        #   8. practical / hands-on  9. experimental / task-based
        #   also: how interactive it is, how hands-on it is
        # Candidate topic features:
        #   1. Basic / Optional: basic is one that is necessary, like a basic need of life.
        #   About topic: importance of topic
        # Teacher's prior/preference on a scale of 10. This could be inferred as ranking.
        # Type: video, audio, reading, short film, television clip, books, blog, articles,
        #       music, kinesthetics, engagement level or interactive content, quiz, task-based
        self.content_context = ['A', 'B', 'C', 'D', 'E', 'F']  # Content features. Add meaningful features.
        self.prob_content_context = [0.8, 0.7, 0.6, 0.5, 0.4, 0.3]
        # Variable number of contents per topic. Uses self.number_of_topics:
        # the bare name is not defined in this scope and raised NameError.
        self.no_contents_per_topic = np.random.randint(2, 5, self.number_of_topics)
        # Build the id mapping up front so getTopicContent() is usable even
        # before getContentsFeatures() has been called. create() draws no
        # random numbers, so this does not change the random stream.
        self.topic_content, self.all_contents = self.create()

    def create(self):
        """Build the topic and content identifiers.

        Returns:
            A ``(topic_content, all_contents)`` tuple: a dict mapping each
            topic id (e.g. ``T_10``) to the list of its content ids (e.g.
            ``C_10_2`` is content number 2 of topic 10), and the flat list
            of all content ids in topic order.
        """
        all_contents = []
        topic_content = {}
        for topic_no, n_contents in enumerate(self.no_contents_per_topic, start=1):
            content_ids = [
                'C_' + str(topic_no) + '_' + str(content_no)
                for content_no in range(1, n_contents + 1)
            ]
            topic_content["T_" + str(topic_no)] = content_ids
            all_contents.extend(content_ids)
        return topic_content, all_contents

    # Content related features
    def getContentsFeatures(self):
        """Return a DataFrame of random 0/1 features, one row per content.

        The topic/content ids are rebuilt from ``no_contents_per_topic`` on
        every call, so the frame's index always matches the mapping returned
        by ``getTopicContent``.
        """
        self.topic_content, self.all_contents = self.create()
        samples = np.random.binomial(
            1,
            self.prob_content_context,
            size=(len(self.all_contents), len(self.content_context)),
        )
        return pd.DataFrame(
            data=samples,
            columns=self.content_context,
            index=self.all_contents,
        )

    def getTopicContent(self):
        """Return the dict mapping each topic id to its list of content ids."""
        return self.topic_content