# Recommendation System for LearnLink: A Language Learning Platform for Children aged 11-22 with Disabilities
- [Project Presentation Deck]()

#### Author: Lillian Lakes
- [GitHub Profile](https://github.com/lillianlakes) 
- [LinkedIn Profile](https://www.linkedin.com/in/lillianlakes/) 

## Preliminary Steps

In [306]:
# Import required libraries
import os
import random

import numpy as np
import pandas as pd
from faker import Faker
from scipy import stats

## Create Synthetic Data

In [307]:
# Set random seeds for reproducibility.
# Both the stdlib `random` module (used to pick a preset) and NumPy's global
# generator (which SciPy's sampling falls back to when no random_state is given)
# are seeded, so repeated runs generate the same synthetic tables.
random.seed(0)
np.random.seed(0)

# Create a Faker instance for generating fake names, emails, and dates.
# The instance is seeded on its own so its output is reproducible as well.
fake = Faker()
fake.seed_instance(0)

# Languages available on the platform: the top seven foreign languages taught
# as part of K-12 education, followed by English (for English as a Second
# Language students). English must remain the last entry because the user
# generator reads it as languages[-1].
languages = [
    'Spanish',
    'French',
    'German',
    'Latin',
    'Japanese',
    'Chinese',
    'Russian',
    'English',
]

# Disability labels assigned to users in round-robin order
disabilities = ['dyslexia', 'ADHD', 'color blind']

# Lower and upper bounds of the generated test scores
score_min = 0
score_max = 100

# Define WCAG-based accessibility settings for different disability types, with appropriate average 
# test scores and score standard deviations.
#
# Structure: a list with one dict per disability, each holding
#   - 'disability_name': the disability label
#   - 'accessibility_types': a list of preset dicts. Every preset carries the display
#     settings that generate_settings_data copies into the settings table, plus
#     'score_avg' and 'score_std', the mean and standard deviation of the truncated
#     normal distribution from which that preset's test score is sampled.
#
# NOTE: the groups are listed in the same order as `disabilities`. Users and settings are
# each assigned by cyclic position (i % length) and then paired row by row, so reordering
# either list would pair users with presets of a different disability.
disabilities_settings = [
    # Accessibility settings for dyslexia
     {
      'disability_name': 'dyslexia',
      'accessibility_types': [
          # WCAG-based accessibility settings with high score averages and low standard deviations
         {
          'font_color' : 'charcoal',
          'background_color' : 'pastel light blue',
          'font_size' : 15,
          'font_weight' : 'regular',
          'font_family' : 'Trebuchet',
          'letter_spacing' : 5.25,	
          'line_spacing' : 22.5,
          'word_spacing' : 78.75,
          'text_alignment' : 'left aligned',	
          'is_auto_play_tts' : False,
          'bionic_reading' : 'default',
          'lists with bullets': 'default',
          'score_avg': 85,
          'score_std': 10
         },
         {
          'font_color' : 'licorice',
          'background_color' : 'pastel light yellow',
          'font_size' : 14,
          'font_weight' : 'regular',
          'font_family' : 'Tahoma',
          'letter_spacing' : 4.9,	
          'line_spacing' : 21,
          'word_spacing' : 73.5,
          'text_alignment' : 'left aligned',	
          'is_auto_play_tts' : False,
          'bionic_reading' : 'default',
          'lists with bullets': 'default',
          'score_avg': 80,
          'score_std': 5
         },
         {
          'font_color' : 'onyx',
          'background_color' : 'light cream',
          'font_size' : 12,
          'font_weight' : 'regular',
          'font_family' : 'Century Gothic',
          'letter_spacing' : 4.2,	
          'line_spacing' : 18,
          'word_spacing' : 63,
          'text_alignment' : 'left aligned',	
          'is_auto_play_tts' : False,
          'bionic_reading' : 'default',
          'lists with bullets': 'default',
          'score_avg': 95,
          'score_std': 5
         },
         {
          'font_color' : 'matte black',
          'background_color' : 'light gray',
          'font_size' : 13,
          'font_weight' : 'regular',
          'font_family' : 'Open Sans',
          'letter_spacing' : 4.55,	
          'line_spacing' : 19.5,
          'word_spacing' : 68.25,
          'text_alignment' : 'left aligned',	
          'is_auto_play_tts' : False,
          'bionic_reading' : 'default',
          'lists with bullets': 'default',
          'score_avg': 90,
          'score_std': 10
         },
         # Non-WCAG-based accessibility setting with a low score average and higher standard deviation
         {
          'font_color' : 'white',
          'background_color' : 'red',
          'font_size' : 8,
          'font_weight' : 'regular',
          'font_family' : 'Open Sans',
          'letter_spacing' : 2.55,	
          'line_spacing' : 7.5,
          'word_spacing' : 30.25,
          'text_alignment' : 'left aligned',	
          'is_auto_play_tts' : False,
          'bionic_reading' : 'default',
          'lists with bullets': 'default',
          'score_avg': 50,
          'score_std': 15
         }
      ]
   },
   # Accessibility settings for ADHD
    {
      'disability_name': 'ADHD',
      'accessibility_types': [
          # WCAG-based accessibility settings with high score averages and low standard deviations
         {
          'font_color' : 'dark gray',
          'background_color' : 'eggshell',
          'font_size' : 16,
          'font_weight' : 'regular',
          'font_family' : 'Open Sans',
          'letter_spacing' : 1.92,	
          'line_spacing' : 28,
          'word_spacing' : 2.56,
          'text_alignment' : 'left aligned',	
          'is_auto_play_tts' : 'default',
          'bionic_reading' : True,
          'lists with bullets': True,
          'score_avg': 95,
          'score_std': 5
         },
         {
          'font_color' : 'midnight blue',
          'background_color' : 'ivory',
          'font_size' : 12,
          'font_weight' : 'regular',
          'font_family' : 'Lexend',
          'letter_spacing' : 1.44,	
          'line_spacing' : 21,
          'word_spacing' : 1.92,
          'text_alignment' : 'left aligned',	
          'is_auto_play_tts' : 'default',
          'bionic_reading' : True,
          'lists with bullets': True,
          'score_avg': 90,
          'score_std': 10
         },
         {
          'font_color' : 'charcoal',
          'background_color' : 'light gray',
          'font_size' : 14,
          'font_weight' : 'regular',
          'font_family' : 'Helvetica',
          'letter_spacing' : 1.68,	
          'line_spacing' : 24.5,
          'word_spacing' : 2.24,
          'text_alignment' : 'left aligned',	
          'is_auto_play_tts' : 'default',
          'bionic_reading' : True,
          'lists with bullets': True,
          'score_avg': 85,
          'score_std': 10
         },
         # Non-WCAG-based accessibility setting with a low score average and higher standard deviation
         {
          'font_color' : 'yellow',
          'background_color' : 'orange',
          'font_size' : 9,
          'font_weight' : 'regular',
          'font_family' : 'Helvetica',
          'letter_spacing' : 1.18,	
          'line_spacing' : 12.5,
          'word_spacing' : 1.24,
          'text_alignment' : 'left aligned',	
          'is_auto_play_tts' : 'default',
          'bionic_reading' : False,
          'lists with bullets': False,
          'score_avg': 30,
          'score_std': 30
         }
      ]
   },
   # Accessibility settings for color blindness
    {
      'disability_name': 'color blind',
      'accessibility_types': [
          # WCAG-based accessibility settings with high score averages and low standard deviations
         {
          'font_color' : 'nightrider',
          'background_color' : 'quartz',
          'font_size' : 15,
          'font_weight' : 'bold',
          'font_family' : 'Poppins',
          'letter_spacing' : 1.8,	
          'line_spacing' : 22.5,
          'word_spacing' : 2.4,
          'text_alignment' : 'default',	
          'is_auto_play_tts' : 'default',
          'bionic_reading' : 'default',
          'lists with bullets': 'default',
          'score_avg': 95,
          'score_std': 5
         },
         {
          'font_color' : 'black russian',
          # NOTE(review): 'prim' may be a truncated color name - confirm the intended value
          'background_color' : 'prim',
          'font_size' : 18,
          'font_weight' : 'regular',
          'font_family' : 'Source Sans Pro',
          'letter_spacing' : 2.16,	
          'line_spacing' : 27,
          'word_spacing' : 2.88,
          'text_alignment' : 'default',	
          'is_auto_play_tts' : 'default',
          'bionic_reading' : 'default',
          'lists with bullets': 'default',
          'score_avg': 90,
          'score_std': 10
         },
         {
          'font_color' : 'charcoal',
          'background_color' : 'linen',
          'font_size' : 24,
          'font_weight' : 'regular',
          'font_family' : 'Century Gothic',
          'letter_spacing' : 2.88,	
          'line_spacing' : 36,
          'word_spacing' : 3.84,
          'text_alignment' : 'default',	
          'is_auto_play_tts' : 'default',
          'bionic_reading' : 'default',
          'lists with bullets': 'default',
          'score_avg': 85,
          'score_std': 10
         },
         # Non-WCAG-based accessibility setting with a low score average and higher standard deviation
         {
          'font_color' : 'green',
          'background_color' : 'yellow',
          'font_size' : 10,
          'font_weight' : 'regular',
          'font_family' : 'Century Gothic',
          'letter_spacing' : 1.88,	
          'line_spacing' : 15,
          'word_spacing' : 1.84,
          'text_alignment' : 'default',	
          'is_auto_play_tts' : 'default',
          'bionic_reading' : 'default',
          'lists with bullets': 'default',
          'score_avg': 45,
          'score_std': 15
         }
      ]
   },
]

# Function to generate user data
def generate_user_data(x):
    """
    Generate user data based on the specified number (x) of users.

    Language pairs follow a repeating cycle derived from the `languages` list,
    whose last entry is the platform language (English in this notebook):
    the first half of the cycle yields users whose native language is each
    foreign language in turn and who are learning the last entry; the second
    half yields native speakers of the last entry learning each foreign
    language in turn. Disabilities are assigned round-robin from `disabilities`.

    The cycle length is computed from `len(languages)` instead of a fixed
    offset, so the pairing stays correct if languages are added or removed.
    `languages` is expected to hold at least two entries.

    Parameters:
    - x (int): Number of users to generate

    Returns:
    - DataFrame: Dataframe containing user data
    """
    platform_language = languages[-1]
    foreign_languages = languages[:-1]
    n_foreign = len(foreign_languages)
    # One full cycle = every foreign language as native, then every one as target
    cycle_length = 2 * n_foreign

    users_data = []
    for i in range(x):
        slot = i % cycle_length
        if slot < n_foreign:
            native_language = foreign_languages[slot]
            learning_language = platform_language
        else:
            native_language = platform_language
            learning_language = foreign_languages[slot - n_foreign]

        # Keep the Faker calls in this exact order: the seeded instance produces
        # the same people only if its methods are invoked in the same sequence.
        users_data.append({
            'id': 10000001 + i,
            'first_name': fake.first_name(),
            'last_name': fake.last_name(),
            'username': fake.email(),
            'date_of_birth': fake.date_of_birth(None, 11, 22),
            'native_language': native_language,
            'learning_language': learning_language,
            'disability': disabilities[i % len(disabilities)],
            'created_at': fake.date_time_between('-1d'),
        })
    return pd.DataFrame(users_data)

# Function to generate accessibility settings data
def generate_settings_data(x):
    """
    Build x synthetic accessibility-setting rows.

    Row i draws from the disability group at cyclic position
    i % len(disabilities_settings), picks one of that group's presets at
    random, copies the preset's display fields, and samples a test score from
    a normal distribution truncated to [score_min, score_max] using the
    preset's 'score_avg' and 'score_std'.

    Parameters:
    - x (int): Number of settings to generate

    Returns:
    - DataFrame: Dataframe containing accessibility settings data
    """
    # Preset fields copied unchanged into each row, in output column order
    copied_fields = (
        'font_color',
        'background_color',
        'font_size',
        'font_weight',
        'font_family',
        'letter_spacing',
        'line_spacing',
        'word_spacing',
        'text_alignment',
        'is_auto_play_tts',
        'bionic_reading',
        'lists with bullets',
    )

    rows = []
    for offset in range(x):
        group = disabilities_settings[offset % len(disabilities_settings)]
        presets = group['accessibility_types']
        # Choose one preset of this group uniformly at random
        preset = presets[random.randint(0, len(presets) - 1)]

        mean = preset['score_avg']
        spread = preset['score_std']
        # truncnorm expects its clipping bounds in standard-deviation units around loc
        lower_bound = (score_min - mean) / spread
        upper_bound = (score_max - mean) / spread
        sampled_score = stats.truncnorm.rvs(lower_bound, upper_bound, loc=mean, scale=spread)

        row = {'id': 40000001 + offset}
        for field in copied_fields:
            row[field] = preset[field]
        row['score'] = round(sampled_score, 1)
        rows.append(row)

    return pd.DataFrame(rows)

# Function to generate user-settings relationship data   
def generate_user_settings_data(users_df, settings_df):
    """
    Link users to settings by row position.

    The i-th user row is paired with the i-th settings row; pairing stops at
    the end of the shorter dataframe. Each link gets a sequential id starting
    at 70000001.

    Parameters:
    - users_df (dataframe): Dataframe containing user data (needs an 'id' column)
    - settings_df (dataframe): Dataframe containing accessibility settings data (needs an 'id' column)

    Returns:
    - Dataframe: Dataframe containing user-settings relationship data
    """
    paired_rows = zip(users_df.itertuples(), settings_df.itertuples())

    links = []
    for link_id, (user_row, setting_row) in enumerate(paired_rows, start=70000001):
        links.append({
            'id': link_id,
            'user_id': user_row.id,
            'setting_id': setting_row.id,
        })
    return pd.DataFrame(links)

# Build the three synthetic tables: 1000 users, 1000 settings, and one link per aligned pair
users = generate_user_data(1000)
settings = generate_settings_data(1000)
user_settings = generate_user_settings_data(users, settings)

# Normalize column dtypes
users = (
    users
    .astype({'id': 'int64'})
    .assign(date_of_birth=lambda frame: pd.to_datetime(frame['date_of_birth']))
)
# NOTE(review): 'is_auto_play_tts' mixes False with the string 'default'; casting to bool
# turns every 'default' into True, so only the dyslexia presets end up False.
# Confirm that this is the intended encoding.
settings = settings.astype({
    'id': 'int64',
    'font_size': 'int64',
    'is_auto_play_tts': 'bool',
})
user_settings = user_settings.astype({
    'id': 'int64',
    'user_id': 'int64',
    'setting_id': 'int64',
})


### Export DataFrames as CSV Files

In [308]:
# Make sure the output directory exists so this cell also runs on a fresh checkout;
# DataFrame.to_csv does not create missing parent directories.
os.makedirs('data', exist_ok=True)

# Write each table to its own CSV file (the dataframe index is written as the first column)
users.to_csv('data/Users_Table.csv')
settings.to_csv('data/Settings_Table.csv')
user_settings.to_csv('data/User_Settings_Table.csv')

## Exploratory Data Analysis (EDA)

### Explore the Users data

In [309]:
# Summarize the users table: column names, non-null counts, dtypes, and memory usage
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 1000 non-null   int64         
 1   first_name         1000 non-null   object        
 2   last_name          1000 non-null   object        
 3   username           1000 non-null   object        
 4   date_of_birth      1000 non-null   datetime64[ns]
 5   native_language    1000 non-null   object        
 6   learning_language  1000 non-null   object        
 7   disability         1000 non-null   object        
 8   created_at         1000 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(6)
memory usage: 70.4+ KB


In [310]:
# Preview the first seven user rows
users.head(7)

Unnamed: 0,id,first_name,last_name,username,date_of_birth,native_language,learning_language,disability,created_at
0,10000001,Megan,Chang,gwilliams@example.com,2007-08-21,Spanish,English,dyslexia,2023-09-29 18:14:19
1,10000002,Tammy,Howard,williamcampbell@example.org,2011-04-08,French,English,ADHD,2023-09-29 16:18:58
2,10000003,Vanessa,Patel,kyleblair@example.net,2006-05-13,German,English,color blind,2023-09-30 00:23:08
3,10000004,Anita,Gomez,cheryl38@example.com,2008-04-11,Latin,English,dyslexia,2023-09-30 02:10:38
4,10000005,Jorge,Trujillo,davismary@example.net,2012-02-11,Japanese,English,ADHD,2023-09-30 05:57:38
5,10000006,Aaron,Snyder,john51@example.org,2010-05-27,Chinese,English,color blind,2023-09-29 15:16:03
6,10000007,John,Ponce,udavis@example.net,2009-01-26,Russian,English,dyslexia,2023-09-29 11:09:58


### Explore the Settings data

In [311]:
# Summarize the settings table: column names, non-null counts, dtypes, and memory usage
settings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  1000 non-null   int64  
 1   font_color          1000 non-null   object 
 2   background_color    1000 non-null   object 
 3   font_size           1000 non-null   int64  
 4   font_weight         1000 non-null   object 
 5   font_family         1000 non-null   object 
 6   letter_spacing      1000 non-null   float64
 7   line_spacing        1000 non-null   float64
 8   word_spacing        1000 non-null   float64
 9   text_alignment      1000 non-null   object 
 10  is_auto_play_tts    1000 non-null   bool   
 11  bionic_reading      1000 non-null   object 
 12  lists with bullets  1000 non-null   object 
 13  score               1000 non-null   float64
dtypes: bool(1), float64(4), int64(2), object(7)
memory usage: 102.7+ KB


In [312]:
# Preview the first seven settings rows
settings.head(7)

Unnamed: 0,id,font_color,background_color,font_size,font_weight,font_family,letter_spacing,line_spacing,word_spacing,text_alignment,is_auto_play_tts,bionic_reading,lists with bullets,score
0,40000001,matte black,light gray,13,regular,Open Sans,4.55,19.5,68.25,left aligned,False,default,default,89.0
1,40000002,yellow,orange,9,regular,Helvetica,1.18,12.5,1.24,left aligned,True,False,False,50.6
2,40000003,nightrider,quartz,15,bold,Poppins,1.8,22.5,2.4,default,True,default,default,95.1
3,40000004,onyx,light cream,12,regular,Century Gothic,4.2,18.0,63.0,left aligned,False,default,default,94.5
4,40000005,yellow,orange,9,regular,Helvetica,1.18,12.5,1.24,left aligned,True,False,False,30.8
5,40000006,green,yellow,10,regular,Century Gothic,1.88,15.0,1.84,default,True,default,default,50.6
6,40000007,onyx,light cream,12,regular,Century Gothic,4.2,18.0,63.0,left aligned,False,default,default,93.3


In [313]:
# settings.font_color.value_counts()

In [314]:
# settings.score.value_counts()

In [315]:
# settings.score.sort_values(ascending = False)

### Explore the User_Settings data

In [316]:
# Summarize the user-settings link table: column names, non-null counts, dtypes, and memory usage
user_settings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   id          1000 non-null   int64
 1   user_id     1000 non-null   int64
 2   setting_id  1000 non-null   int64
dtypes: int64(3)
memory usage: 23.6 KB


In [317]:
# Preview the first seven user-settings links
user_settings.head(7)

Unnamed: 0,id,user_id,setting_id
0,70000001,10000001,40000001
1,70000002,10000002,40000002
2,70000003,10000003,40000003
3,70000004,10000004,40000004
4,70000005,10000005,40000005
5,70000006,10000006,40000006
6,70000007,10000007,40000007
