# National Taipei University of Technology
## Big Data Mining and Applications, Fall 2019
### FINAL PROJECT: Analyzing Student Performance Records

Team Members: Jason King (陳福國), Mao Zhi Heng (毛智恆)<br>
ID: 105590051, 106AEA002

In [1]:
from pyspark import SparkContext
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from itertools import islice
import timeit as t
import numpy as np
import re

def percent(x, y):  # calculates percentage
    """Return ``x`` as a percentage of ``y``, formatted as a string.

    The ratio is rounded to two decimal places with ``np.round`` and a
    trailing "%" is appended, e.g. ``percent(1, 2)`` gives ``"50.0%"``.
    """
    ratio = x * 100 / y
    rounded = np.round(ratio, decimals=2)
    return "{}%".format(str(rounded))

# Reuse the running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()
# Wrap the context in a SparkSession.
# NOTE(review): `spark` is not referenced again in the visible cells; presumably
# creating the session is what makes RDD.toDF() (used in splitdata) available
# -- confirm before removing it.
spark = SparkSession(sc)

In [2]:
# Bare expression so the notebook shows the SparkContext as this cell's output.
sc

## Part 0: Initialization

Preparation of data

### Loading the CSV into RDD

Note: To not complicate stuff, we will just work on the Portuguese dataset.

In [3]:
# Read the raw file and split every line on ";" into a list of string fields.
# The first row of partition 0 is skipped (assumed to be the CSV header line);
# all other partitions are passed through unchanged.
d = (
    sc.textFile('student-por.csv')
    .map(lambda line: line.split(";"))
    .mapPartitionsWithIndex(
        lambda part_idx, rows: islice(rows, 1, None) if part_idx == 0 else rows
    )
)

# for boolean values, we try to enumerate the values where necessary
# everything is 0 and 1 for no and yes, respectively, unless specified otherwise

def chk(x): # yes/no
    """Encode a yes/no field as an integer flag.

    Returns 1 when the substring "yes" occurs in ``x`` and 0 otherwise.
    Containment is tested rather than equality, so a value that still
    carries its surrounding quote characters is matched as well.
    """
    return 1 if "yes" in x else 0

# UNUSED
def school_chk(x): # checks school data (x[0])
    """Encode the school code as an integer.

    Returns 1 for "GP" (Gabriel Pereira) and 2 for anything else
    (Mousinho da Silveira).

    The raw school field is not unquoted by cleandata, so it arrives as
    '"GP"' including the quote characters. Surrounding double quotes are
    stripped before comparing; without that, every quoted value would fall
    through to 2.
    """
    if x.strip('"') == "GP": # Gabriel Pereira
        return 1
    return 2     # Mousinho da Silveira

def group_grades(x): # groups grades into four tiers
    """Map a numeric grade to one of four tiers.

    Returns 3 (high pass) for grades of 17 and above, 2 (medium pass) for
    13 and above, 1 (low pass) for 10 and above, and 0 (fail) otherwise.
    """
    # Thresholds are checked from the highest tier down; the first hit wins.
    for tier, floor in ((3, 17), (2, 13), (1, 10)):
        if x >= floor:
            return tier
    return 0     # fail

def cleandata(data):
    """Convert one parsed CSV row from raw strings to analysis-ready values.

    The list is modified in place and also returned:

    * indices 2, 6, 7, 12, 13, 14 are converted to ``int``;
    * indices 3 and 5 have their double-quote characters removed;
    * indices 15-22 are turned into 0/1 flags with ``chk``;
    * indices 23-32 have quotes removed and are converted to ``int``;
    * indices 30-32 (the grade columns) are then replaced by their tier
      from ``group_grades``.

    All other fields are left exactly as they were read.
    """
    def unquote(text):
        return re.sub('\"', '', text)

    # The school column (index 0) is deliberately left untouched;
    # school_chk is not applied here.
    for pos in (2, 6, 7, 12, 13, 14):
        data[pos] = int(data[pos])
    for pos in (3, 5):
        data[pos] = unquote(data[pos])
    for pos in range(15, 23):
        data[pos] = chk(data[pos])
    for pos in range(23, 33):
        value = int(unquote(data[pos]))
        # Only the last three numeric columns are bucketed into tiers.
        data[pos] = group_grades(value) if pos >= 30 else value
    return data
    
# Clean every parsed row and keep the result in memory. The cells below run
# many separate actions (count, reduce, collect) on this RDD; without
# persisting it, each action would re-read and re-clean the CSV file.
data = d.map(cleandata).cache()

In [4]:
# Preview the first three cleaned records.
data.take(3)

[['"GP"',
  '"F"',
  18,
  'U',
  '"GT3"',
  'A',
  4,
  4,
  '"at_home"',
  '"teacher"',
  '"course"',
  '"mother"',
  2,
  2,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  4,
  3,
  4,
  1,
  1,
  3,
  4,
  0,
  1,
  1],
 ['"GP"',
  '"F"',
  17,
  'U',
  '"GT3"',
  'T',
  1,
  1,
  '"at_home"',
  '"other"',
  '"course"',
  '"father"',
  1,
  2,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  5,
  3,
  3,
  1,
  1,
  3,
  2,
  0,
  1,
  1],
 ['"GP"',
  '"F"',
  15,
  'U',
  '"LE3"',
  'T',
  1,
  1,
  '"at_home"',
  '"other"',
  '"other"',
  '"mother"',
  1,
  2,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  4,
  3,
  2,
  2,
  3,
  3,
  6,
  1,
  2,
  1]]

### Assumptions

* Both schools use the same curriculum, use the same teaching materials, and the exams are set at the same difficulty level.
* The students' habits remain the same throughout the academic year.

## Execution Script

In [5]:
from operator import add

def key(x):
    """Return the display label for a grade tier.

    0 -> "Failed", 1 -> "Low Pass", 2 -> "Med Pass"; any other value is
    labelled "High Pass".
    """
    for tier, label in enumerate(("Failed", "Low Pass", "Med Pass")):
        if x == tier:
            return label
    return "High Pass"

def run(data, tar, c):
    """Print the tier distribution for one grade column and score it.

    Parameters:
        data: RDD of rows whose element at index ``tar`` is a grade tier.
        tar:  index of the tier column; also printed as the grade number
              ("For G<tar>").
        c:    number of rows in ``data``, used as the denominator for the
              percentages and the average.

    Prints a list of ``[label, count, percentage]`` entries, one per tier
    that actually occurs, ordered by tier.

    Returns:
        ``(points, average)`` where ``points`` is the sum of tier * count
        over all tiers and ``average`` is ``points / c`` rounded to two
        decimals.
    """
    tier_counts = (
        data.map(lambda row: (row[tar], 1))
        .reduceByKey(add)
        .sortByKey()
    )
    summary = tier_counts.map(
        lambda pair: [key(pair[0]), pair[1], percent(pair[1], c)]
    )

    print("For G", tar, ", number of students having...", sep='')
    print(summary.collect())

    # Points approach: 3 points per high pass, 2 per medium pass, 1 per low
    # pass (a fail contributes 0).
    points = tier_counts.map(lambda pair: pair[0] * pair[1]).reduce(add)
    return points, np.round(points / c, 2)

def splitdata(data, count):
    """Report the G1-G3 tier breakdown, points and averages for one group.

    Parameters:
        data:  RDD of rows shaped ``[factor, G1 tier, G2 tier, G3 tier]``
               (columns 1-3 are what ``run`` is asked to read).
        count: number of rows in ``data``.

    When ``count`` is 0 only a "No data in this category" message is
    printed. Otherwise ``run`` is called for columns 1, 2 and 3, and two
    one-row tables are shown: the point totals and the averages.
    """
    if count == 0:
        print("No data in this category")
        return

    # run() prints each grade's tier breakdown as a side effect.
    results = [run(data, grade, count) for grade in (1, 2, 3)]
    (pg1, ag1), (pg2, ag2), (pg3, ag3) = results

    point_columns = ["G1 points", "G2 points", "G3 points"]
    sc.parallelize([Row(pg1, pg2, pg3)]).toDF(point_columns).show()

    # Averages are converted to strings before being placed in the table.
    average_columns = ["G1 average", "G2 average", "G3 average"]
    sc.parallelize([Row(str(ag1), str(ag2), str(ag3))]).toDF(average_columns).show()

## Part 1: Home Location

We will try to determine how a student's home location (urban/rural) affects a student's grades.

In [6]:
# Keep only column 3 (home location code) and columns 30-32 (G1-G3 tiers),
# then split the rows into urban ("U") and rural ("R") groups.
temp = data.map(lambda y: [y[i] for i in (3, 30, 31, 32)])
urban = temp.filter(lambda x: x[0]=="U")
rural = temp.filter(lambda x: x[0]=="R")

In [7]:
# Print the size of the urban group and its share of all records,
# then report its G1-G3 tier breakdown.
c = urban.count()
print("Number of students who live in the urban area: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(urban, c)

Number of students who live in the urban area: 452 (69.65%)

For G1, number of students having...
[['Failed', 91, '20.13%'], ['Low Pass', 191, '42.26%'], ['Med Pass', 150, '33.19%'], ['High Pass', 20, '4.42%']]
For G2, number of students having...
[['Failed', 86, '19.03%'], ['Low Pass', 191, '42.26%'], ['Med Pass', 149, '32.96%'], ['High Pass', 26, '5.75%']]
For G3, number of students having...
[['Failed', 56, '12.39%'], ['Low Pass', 179, '39.6%'], ['Med Pass', 183, '40.49%'], ['High Pass', 34, '7.52%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      551|      567|      647|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.22|      1.25|      1.43|
+----------+----------+----------+



In [8]:
# Print the size of the rural group and its share of all records,
# then report its G1-G3 tier breakdown.
c = rural.count()
print("Number of students who live in the rural area: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(rural, c)

Number of students who live in the rural area: 197 (30.35%)

For G1, number of students having...
[['Failed', 66, '33.5%'], ['Low Pass', 77, '39.09%'], ['Med Pass', 50, '25.38%'], ['High Pass', 4, '2.03%']]
For G2, number of students having...
[['Failed', 59, '29.95%'], ['Low Pass', 81, '41.12%'], ['Med Pass', 48, '24.37%'], ['High Pass', 9, '4.57%']]
For G3, number of students having...
[['Failed', 44, '22.34%'], ['Low Pass', 94, '47.72%'], ['Med Pass', 47, '23.86%'], ['High Pass', 12, '6.09%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      189|      204|      224|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      0.96|      1.04|      1.14|
+----------+----------+----------+



Students who live in the urban area perform better than students who live in the rural area.

Possible reasons: There are more facilities that aid students in the urban area to learn -- for example: library, museum, etc.

## Part 2: Cohabitation Status

We will try to determine how a student's cohabitation status (living together/apart) affects a student's grades.

In [9]:
# Keep only column 5 (cohabitation status code) and columns 30-32 (G1-G3
# tiers), then split the rows into together ("T") and apart ("A") groups.
temp = data.map(lambda y: [y[i] for i in (5, 30, 31, 32)])
together = temp.filter(lambda x: x[0]=="T")
apart = temp.filter(lambda x: x[0]=="A")

In [10]:
# Print the size of the "together" group and its share of all records,
# then report its G1-G3 tier breakdown.
c = together.count()
print("Number of students who live together with parents: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(together, c)

Number of students who live together with parents: 569 (87.67%)

For G1, number of students having...
[['Failed', 140, '24.6%'], ['Low Pass', 232, '40.77%'], ['Med Pass', 174, '30.58%'], ['High Pass', 23, '4.04%']]
For G2, number of students having...
[['Failed', 126, '22.14%'], ['Low Pass', 243, '42.71%'], ['Med Pass', 168, '29.53%'], ['High Pass', 32, '5.62%']]
For G3, number of students having...
[['Failed', 88, '15.47%'], ['Low Pass', 241, '42.36%'], ['Med Pass', 199, '34.97%'], ['High Pass', 41, '7.21%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      649|      675|      762|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.14|      1.19|      1.34|
+----------+----------+----------+



In [11]:
# Print the size of the "apart" group and its share of all records,
# then report its G1-G3 tier breakdown.
c = apart.count()
print("Number of students who live separate from parents: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(apart, c)

Number of students who live separate from parents: 80 (12.33%)

For G1, number of students having...
[['Failed', 17, '21.25%'], ['Low Pass', 36, '45.0%'], ['Med Pass', 26, '32.5%'], ['High Pass', 1, '1.25%']]
For G2, number of students having...
[['Failed', 19, '23.75%'], ['Low Pass', 29, '36.25%'], ['Med Pass', 29, '36.25%'], ['High Pass', 3, '3.75%']]
For G3, number of students having...
[['Failed', 12, '15.0%'], ['Low Pass', 32, '40.0%'], ['Med Pass', 31, '38.75%'], ['High Pass', 5, '6.25%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|       91|       96|      109|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.14|       1.2|      1.36|
+----------+----------+----------+



Since the difference on average is very small, we do not think a student's cohabitation status will affect a student's grades.

## Part 3: Travel Time

We will try to determine how a student's travel time from home to school affects a student's grades.

In [12]:
# Keep only column 12 (travel time code, 1-4) and columns 30-32 (G1-G3
# tiers), then make one group per travel time code.
temp = data.map(lambda y: [y[i] for i in (12, 30, 31, 32)])
near = temp.filter(lambda x: x[0]==1) # < 15 min
midd = temp.filter(lambda x: x[0]==2) # 15~30 min
farr = temp.filter(lambda x: x[0]==3) # 30~60 min
vfar = temp.filter(lambda x: x[0]==4) # > 60 min

In [13]:
# Print the size of the <15 min group and its share of all records,
# then report its G1-G3 tier breakdown.
# NOTE(review): the message says "from school to home" while the section
# introduction describes travel "from home to school" -- confirm wording.
c = near.count()
print("Number of students whose travel time from school to home is <15 min: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(near, c)

Number of students whose travel time from school to home is <15 min: 366 (56.39%)

For G1, number of students having...
[['Failed', 71, '19.4%'], ['Low Pass', 148, '40.44%'], ['Med Pass', 130, '35.52%'], ['High Pass', 17, '4.64%']]
For G2, number of students having...
[['Failed', 69, '18.85%'], ['Low Pass', 149, '40.71%'], ['Med Pass', 127, '34.7%'], ['High Pass', 21, '5.74%']]
For G3, number of students having...
[['Failed', 51, '13.93%'], ['Low Pass', 140, '38.25%'], ['Med Pass', 143, '39.07%'], ['High Pass', 32, '8.74%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      459|      466|      522|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.25|      1.27|      1.43|
+----------+----------+----------+



In [14]:
# Print the size of the 15~30 min group and its share of all records,
# then report its G1-G3 tier breakdown.
c = midd.count()
print("Number of students whose travel time from school to home is 15 min~30 min: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(midd, c)

Number of students whose travel time from school to home is 15 min~30 min: 213 (32.82%)

For G1, number of students having...
[['Failed', 60, '28.17%'], ['Low Pass', 92, '43.19%'], ['Med Pass', 54, '25.35%'], ['High Pass', 7, '3.29%']]
For G2, number of students having...
[['Failed', 53, '24.88%'], ['Low Pass', 92, '43.19%'], ['Med Pass', 55, '25.82%'], ['High Pass', 13, '6.1%']]
For G3, number of students having...
[['Failed', 34, '15.96%'], ['Low Pass', 100, '46.95%'], ['Med Pass', 66, '30.99%'], ['High Pass', 13, '6.1%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      221|      241|      271|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.04|      1.13|      1.27|
+----------+----------+----------+



In [15]:
# Print the size of the 30~60 min group and its share of all records,
# then report its G1-G3 tier breakdown.
c = farr.count()
print("Number of students whose travel time from school to home is 30 min~60 min: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(farr, c)

Number of students whose travel time from school to home is 30 min~60 min: 54 (8.32%)

For G1, number of students having...
[['Failed', 21, '38.89%'], ['Low Pass', 19, '35.19%'], ['Med Pass', 14, '25.93%']]
For G2, number of students having...
[['Failed', 16, '29.63%'], ['Low Pass', 23, '42.59%'], ['Med Pass', 14, '25.93%'], ['High Pass', 1, '1.85%']]
For G3, number of students having...
[['Failed', 12, '22.22%'], ['Low Pass', 23, '42.59%'], ['Med Pass', 18, '33.33%'], ['High Pass', 1, '1.85%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|       47|       54|       62|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      0.87|       1.0|      1.15|
+----------+----------+----------+



In [16]:
# Print the size of the >60 min group and its share of all records,
# then report its G1-G3 tier breakdown.
c = vfar.count()
print("Number of students whose travel time from school to home is over 60 min: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(vfar, c)

Number of students whose travel time from school to home is over 60 min: 16 (2.47%)

For G1, number of students having...
[['Failed', 5, '31.25%'], ['Low Pass', 9, '56.25%'], ['Med Pass', 2, '12.5%']]
For G2, number of students having...
[['Failed', 7, '43.75%'], ['Low Pass', 8, '50.0%'], ['Med Pass', 1, '6.25%']]
For G3, number of students having...
[['Failed', 3, '18.75%'], ['Low Pass', 10, '62.5%'], ['Med Pass', 3, '18.75%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|       13|       10|       16|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      0.81|      0.62|       1.0|
+----------+----------+----------+



Students who live further from school have lower grades than those who live closer to school.

Students who live further from school will have to spend lots of time on the road, and these students may feel tired once they reach home and there may not be enough time for them to revise. Not to mention if there are traffic jams or accidents!

## Part 4: Weekly Study Time

We will try to determine how a student's weekly study time affects a student's grades.

In [17]:
# Keep only column 13 (weekly study time code, 1-4) and columns 30-32
# (G1-G3 tiers), then make one group per study time code.
temp = data.map(lambda y: [y[i] for i in (13, 30, 31, 32)])
tl = temp.filter(lambda x: x[0]==1) # < 2 hours
tm = temp.filter(lambda x: x[0]==2) # 2-5 hours
th = temp.filter(lambda x: x[0]==3) # 5~10 hours
tv = temp.filter(lambda x: x[0]==4) # >10 hours

In [18]:
# Print the size of the <2 hours group and its share of all records,
# then report its G1-G3 tier breakdown.
c = tl.count()
print("Number of students who spend less than 2 hours of study per week: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(tl, c)

Number of students who spend less than 2 hours of study per week: 212 (32.67%)

For G1, number of students having...
[['Failed', 79, '37.26%'], ['Low Pass', 87, '41.04%'], ['Med Pass', 40, '18.87%'], ['High Pass', 6, '2.83%']]
For G2, number of students having...
[['Failed', 66, '31.13%'], ['Low Pass', 95, '44.81%'], ['Med Pass', 46, '21.7%'], ['High Pass', 5, '2.36%']]
For G3, number of students having...
[['Failed', 50, '23.58%'], ['Low Pass', 105, '49.53%'], ['Med Pass', 50, '23.58%'], ['High Pass', 7, '3.3%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      185|      202|      226|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      0.87|      0.95|      1.07|
+----------+----------+----------+



In [19]:
# Print the size of the 2-5 hours group and its share of all records,
# then report its G1-G3 tier breakdown.
c = tm.count()
print("Number of students who spend 2 to 5 hours of study per week: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(tm, c)

Number of students who spend 2 to 5 hours of study per week: 305 (47.0%)

For G1, number of students having...
[['Failed', 67, '21.97%'], ['Low Pass', 128, '41.97%'], ['Med Pass', 101, '33.11%'], ['High Pass', 9, '2.95%']]
For G2, number of students having...
[['Failed', 64, '20.98%'], ['Low Pass', 128, '41.97%'], ['Med Pass', 96, '31.48%'], ['High Pass', 17, '5.57%']]
For G3, number of students having...
[['Failed', 41, '13.44%'], ['Low Pass', 125, '40.98%'], ['Med Pass', 118, '38.69%'], ['High Pass', 21, '6.89%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      357|      371|      424|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.17|      1.22|      1.39|
+----------+----------+----------+



In [20]:
# Print the size of the 5~10 hours group and its share of all records,
# then report its G1-G3 tier breakdown.
c = th.count()
print("Number of students who spend 5 to 10 hours of study per week: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(th, c)

Number of students who spend 5 to 10 hours of study per week: 97 (14.95%)

For G1, number of students having...
[['Failed', 9, '9.28%'], ['Low Pass', 39, '40.21%'], ['Med Pass', 45, '46.39%'], ['High Pass', 4, '4.12%']]
For G2, number of students having...
[['Failed', 10, '10.31%'], ['Low Pass', 36, '37.11%'], ['Med Pass', 44, '45.36%'], ['High Pass', 7, '7.22%']]
For G3, number of students having...
[['Failed', 7, '7.22%'], ['Low Pass', 28, '28.87%'], ['Med Pass', 51, '52.58%'], ['High Pass', 11, '11.34%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      141|      145|      163|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.45|      1.49|      1.68|
+----------+----------+----------+



In [21]:
# Print the size of the >10 hours group and its share of all records,
# then report its G1-G3 tier breakdown.
c = tv.count()
print("Number of students who spend more than 10 hours of study per week: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(tv, c)

Number of students who spend more than 10 hours of study per week: 35 (5.39%)

For G1, number of students having...
[['Failed', 2, '5.71%'], ['Low Pass', 14, '40.0%'], ['Med Pass', 14, '40.0%'], ['High Pass', 5, '14.29%']]
For G2, number of students having...
[['Failed', 5, '14.29%'], ['Low Pass', 13, '37.14%'], ['Med Pass', 11, '31.43%'], ['High Pass', 6, '17.14%']]
For G3, number of students having...
[['Failed', 2, '5.71%'], ['Low Pass', 15, '42.86%'], ['Med Pass', 11, '31.43%'], ['High Pass', 7, '20.0%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|       57|       53|       58|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.63|      1.51|      1.66|
+----------+----------+----------+



Although we understand that some students study less than two hours per week and still pass their exams with flying colors, it is evident that students who study for at least 5 hours per week have the highest average scores.

## Part 5: Past Class Failures

We will try to determine whether past class failures do motivate students to do better.

In [22]:
# Keep only column 14 (number of past class failures, 0-3) and columns
# 30-32 (G1-G3 tiers), then make one group per failure count.
temp = data.map(lambda y: [y[i] for i in (14, 30, 31, 32)])
f0 = temp.filter(lambda x: x[0]==0) # no class failures
f1 = temp.filter(lambda x: x[0]==1) # 1 class failure
f2 = temp.filter(lambda x: x[0]==2) # 2 class failures
f3 = temp.filter(lambda x: x[0]==3) # 3 class failures

In [23]:
# Print the size of the no-failure group and its share of all records,
# then report its G1-G3 tier breakdown.
c = f0.count()
print("Number of students who have had no class failures: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(f0, c)

Number of students who have had no class failures: 549 (84.59%)

For G1, number of students having...
[['Failed', 87, '15.85%'], ['Low Pass', 242, '44.08%'], ['Med Pass', 196, '35.7%'], ['High Pass', 24, '4.37%']]
For G2, number of students having...
[['Failed', 81, '14.75%'], ['Low Pass', 239, '43.53%'], ['Med Pass', 194, '35.34%'], ['High Pass', 35, '6.38%']]
For G3, number of students having...
[['Failed', 51, '9.29%'], ['Low Pass', 227, '41.35%'], ['Med Pass', 225, '40.98%'], ['High Pass', 46, '8.38%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      706|      732|      815|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.29|      1.33|      1.48|
+----------+----------+----------+



In [24]:
# Print the size of the 1-failure group and its share of all records,
# then report its G1-G3 tier breakdown.
c = f1.count()
print("Number of students who have had 1 class failure: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(f1, c)

Number of students who have had 1 class failure: 70 (10.79%)

For G1, number of students having...
[['Failed', 46, '65.71%'], ['Low Pass', 21, '30.0%'], ['Med Pass', 3, '4.29%']]
For G2, number of students having...
[['Failed', 41, '58.57%'], ['Low Pass', 27, '38.57%'], ['Med Pass', 2, '2.86%']]
For G3, number of students having...
[['Failed', 32, '45.71%'], ['Low Pass', 34, '48.57%'], ['Med Pass', 4, '5.71%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|       27|       31|       42|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      0.39|      0.44|       0.6|
+----------+----------+----------+



In [25]:
# Print the size of the 2-failure group and its share of all records,
# then report its G1-G3 tier breakdown.
c = f2.count()
print("Number of students who have had 2 class failures: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(f2, c)

Number of students who have had 2 class failures: 16 (2.47%)

For G1, number of students having...
[['Failed', 13, '81.25%'], ['Low Pass', 2, '12.5%'], ['Med Pass', 1, '6.25%']]
For G2, number of students having...
[['Failed', 11, '68.75%'], ['Low Pass', 4, '25.0%'], ['Med Pass', 1, '6.25%']]
For G3, number of students having...
[['Failed', 8, '50.0%'], ['Low Pass', 7, '43.75%'], ['Med Pass', 1, '6.25%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|        4|        6|        9|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      0.25|      0.38|      0.56|
+----------+----------+----------+



In [26]:
# Print the size of the 3-failure group and its share of all records,
# then report its G1-G3 tier breakdown.
c = f3.count()
print("Number of students who have had 3 class failures: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(f3, c)

Number of students who have had 3 class failures: 14 (2.16%)

For G1, number of students having...
[['Failed', 11, '78.57%'], ['Low Pass', 3, '21.43%']]
For G2, number of students having...
[['Failed', 12, '85.71%'], ['Low Pass', 2, '14.29%']]
For G3, number of students having...
[['Failed', 9, '64.29%'], ['Low Pass', 5, '35.71%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|        3|        2|        5|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      0.21|      0.14|      0.36|
+----------+----------+----------+



Students who have had class failures in the past definitely performed worse than those who have never had class failures.

However, we can generally see a positive improvement on the grades from the first semester through the end of the academic year.

Unfortunately, among students who have had 2 or 3 class failures, those who still failed at the end of the academic year remain the majority.

## Part 6: School Educational Support

We will try to determine whether school educational support does affect a student's grades.

In [27]:
# Keep only column 15 (school educational support, encoded 1 = yes / 0 = no)
# and columns 30-32 (G1-G3 tiers), then split the rows by that flag.
temp = data.map(lambda y: [y[i] for i in (15, 30, 31, 32)])
yes = temp.filter(lambda x: x[0]==1)
no = temp.filter(lambda x: x[0]==0)

In [28]:
# Print the size of the "yes" group (school educational support) and its
# share of all records, then report its G1-G3 tier breakdown.
c = yes.count()
print("Number of students who receive school educational support: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(yes, c)

Number of students who receive school educational support: 68 (10.48%)

For G1, number of students having...
[['Failed', 18, '26.47%'], ['Low Pass', 31, '45.59%'], ['Med Pass', 18, '26.47%'], ['High Pass', 1, '1.47%']]
For G2, number of students having...
[['Failed', 14, '20.59%'], ['Low Pass', 36, '52.94%'], ['Med Pass', 17, '25.0%'], ['High Pass', 1, '1.47%']]
For G3, number of students having...
[['Failed', 8, '11.76%'], ['Low Pass', 40, '58.82%'], ['Med Pass', 19, '27.94%'], ['High Pass', 1, '1.47%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|       70|       73|       81|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.03|      1.07|      1.19|
+----------+----------+----------+



In [29]:
# Print the size of the "no" group (school educational support) and its
# share of all records, then report its G1-G3 tier breakdown.
c = no.count()
print("Number of students who do not receive school educational support: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(no, c)

Number of students who do not receive school educational support: 581 (89.52%)

For G1, number of students having...
[['Failed', 139, '23.92%'], ['Low Pass', 237, '40.79%'], ['Med Pass', 182, '31.33%'], ['High Pass', 23, '3.96%']]
For G2, number of students having...
[['Failed', 131, '22.55%'], ['Low Pass', 236, '40.62%'], ['Med Pass', 180, '30.98%'], ['High Pass', 34, '5.85%']]
For G3, number of students having...
[['Failed', 92, '15.83%'], ['Low Pass', 233, '40.1%'], ['Med Pass', 211, '36.32%'], ['High Pass', 45, '7.75%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      670|      698|      790|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.15|       1.2|      1.36|
+----------+----------+----------+



Students who receive school educational support score slightly lower than those who don't.

## Part 7: Family Educational Support

We will try to determine whether family educational support does affect a student's grades.

In [30]:
# Keep only column 16 (family educational support, encoded 1 = yes / 0 = no)
# and columns 30-32 (G1-G3 tiers), then split the rows by that flag.
temp = data.map(lambda y: [y[i] for i in (16, 30, 31, 32)])
yes = temp.filter(lambda x: x[0]==1)
no = temp.filter(lambda x: x[0]==0)

In [31]:
# Print the size of the "yes" group (family educational support) and its
# share of all records, then report its G1-G3 tier breakdown.
c = yes.count()
print("Number of students who receive family educational support: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(yes, c)

Number of students who receive family educational support: 398 (61.33%)

For G1, number of students having...
[['Failed', 89, '22.36%'], ['Low Pass', 173, '43.47%'], ['Med Pass', 121, '30.4%'], ['High Pass', 15, '3.77%']]
For G2, number of students having...
[['Failed', 81, '20.35%'], ['Low Pass', 178, '44.72%'], ['Med Pass', 116, '29.15%'], ['High Pass', 23, '5.78%']]
For G3, number of students having...
[['Failed', 57, '14.32%'], ['Low Pass', 168, '42.21%'], ['Med Pass', 144, '36.18%'], ['High Pass', 29, '7.29%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      460|      479|      543|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.16|       1.2|      1.36|
+----------+----------+----------+



In [32]:
# Print the size of the "no" group (family educational support) and its
# share of all records, then report its G1-G3 tier breakdown.
c = no.count()
print("Number of students who do not receive family educational support: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(no, c)

Number of students who do not receive family educational support: 251 (38.67%)

For G1, number of students having...
[['Failed', 68, '27.09%'], ['Low Pass', 95, '37.85%'], ['Med Pass', 79, '31.47%'], ['High Pass', 9, '3.59%']]
For G2, number of students having...
[['Failed', 64, '25.5%'], ['Low Pass', 94, '37.45%'], ['Med Pass', 81, '32.27%'], ['High Pass', 12, '4.78%']]
For G3, number of students having...
[['Failed', 43, '17.13%'], ['Low Pass', 105, '41.83%'], ['Med Pass', 86, '34.26%'], ['High Pass', 17, '6.77%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      280|      292|      328|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.12|      1.16|      1.31|
+----------+----------+----------+



The gap in average score between those who receive family educational support and those who don't is very small (no more than 0.05), so family educational support is not a significant factor.

## Part 8: Extra Paid Classes

We will try to determine the influence of extra paid classes (e.g. attending cram schools (補習班)) on a student's grades.

In [33]:
# Keep only column 17 (extra paid classes, encoded 1 = yes / 0 = no)
# and columns 30-32 (G1-G3 tiers), then split the rows by that flag.
temp = data.map(lambda y: [y[i] for i in (17, 30, 31, 32)])
yes = temp.filter(lambda x: x[0]==1)
no = temp.filter(lambda x: x[0]==0)

In [34]:
# Print the size of the "yes" group (extra paid classes) and its share of
# all records, then report its G1-G3 tier breakdown.
c = yes.count()
print("Number of students who take extra paid classes: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(yes, c)

Number of students who take extra paid classes: 39 (6.01%)

For G1, number of students having...
[['Failed', 12, '30.77%'], ['Low Pass', 19, '48.72%'], ['Med Pass', 7, '17.95%'], ['High Pass', 1, '2.56%']]
For G2, number of students having...
[['Failed', 10, '25.64%'], ['Low Pass', 19, '48.72%'], ['Med Pass', 10, '25.64%']]
For G3, number of students having...
[['Failed', 9, '23.08%'], ['Low Pass', 18, '46.15%'], ['Med Pass', 12, '30.77%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|       36|       39|       42|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      0.92|       1.0|      1.08|
+----------+----------+----------+



In [35]:
# Print the size of the "no" group (extra paid classes) and its share of
# all records, then report its G1-G3 tier breakdown.
c = no.count()
print("Number of students who do not take extra paid classes: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(no, c)

Number of students who do not take extra paid classes: 610 (93.99%)

For G1, number of students having...
[['Failed', 145, '23.77%'], ['Low Pass', 249, '40.82%'], ['Med Pass', 193, '31.64%'], ['High Pass', 23, '3.77%']]
For G2, number of students having...
[['Failed', 135, '22.13%'], ['Low Pass', 253, '41.48%'], ['Med Pass', 187, '30.66%'], ['High Pass', 35, '5.74%']]
For G3, number of students having...
[['Failed', 91, '14.92%'], ['Low Pass', 255, '41.8%'], ['Med Pass', 218, '35.74%'], ['High Pass', 46, '7.54%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      704|      732|      829|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.15|       1.2|      1.36|
+----------+----------+----------+



Students who do not take extra paid classes, surprisingly, performed better than those who take extra paid classes.

Possible reasons: Students who take extra paid classes may already have learning difficulties, which could be why they enrol in those classes in the first place.

## Part 9: Extracurricular Activities

We will try to determine the influence of extracurricular activities on a student's grades.

In [36]:
# Keep only column 18 (extracurricular activities, encoded 1 = yes / 0 = no)
# and columns 30-32 (G1-G3 tiers), then split the rows by that flag.
temp = data.map(lambda y: [y[i] for i in (18, 30, 31, 32)])
yes = temp.filter(lambda x: x[0]==1)
no = temp.filter(lambda x: x[0]==0)

In [37]:
# Print the size of the "yes" group (extracurricular activities) and its
# share of all records, then report its G1-G3 tier breakdown.
c = yes.count()
print("Number of students who take extracurricular activities: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(yes, c)

Number of students who take extracurricular activities: 315 (48.54%)

For G1, number of students having...
[['Failed', 66, '20.95%'], ['Low Pass', 130, '41.27%'], ['Med Pass', 107, '33.97%'], ['High Pass', 12, '3.81%']]
For G2, number of students having...
[['Failed', 61, '19.37%'], ['Low Pass', 129, '40.95%'], ['Med Pass', 110, '34.92%'], ['High Pass', 15, '4.76%']]
For G3, number of students having...
[['Failed', 43, '13.65%'], ['Low Pass', 128, '40.63%'], ['Med Pass', 122, '38.73%'], ['High Pass', 22, '6.98%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      380|      394|      438|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.21|      1.25|      1.39|
+----------+----------+----------+



In [38]:
# Print the size of the "no" group (extracurricular activities) and its
# share of all records, then report its G1-G3 tier breakdown.
c = no.count()
print("Number of students who do not take extracurricular activities: ", c, " (", percent(c, data.count()), ")", sep='')
print()

splitdata(no, c)

Number of students who do not take extracurricular activities: 334 (51.46%)

For G1, number of students having...
[['Failed', 91, '27.25%'], ['Low Pass', 138, '41.32%'], ['Med Pass', 93, '27.84%'], ['High Pass', 12, '3.59%']]
For G2, number of students having...
[['Failed', 84, '25.15%'], ['Low Pass', 143, '42.81%'], ['Med Pass', 87, '26.05%'], ['High Pass', 20, '5.99%']]
For G3, number of students having...
[['Failed', 57, '17.07%'], ['Low Pass', 145, '43.41%'], ['Med Pass', 108, '32.34%'], ['High Pass', 24, '7.19%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      360|      377|      433|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.08|      1.13|       1.3|
+----------+----------+----------+



Students who take extracurricular activities performed better than those who don't.

In fact, <a href="https://www.kon.org/urc/v5/fujita.html">there have been numerous studies on the effects of extracurricular activities on academic performance</a>, and depending on the choice of extracurricular activity students take, it can positively affect academic performance as the students develop skills that aid in their academics (Fujita, 2006).

## Part 10: Nursery School

We will try to determine whether attending nursery school in the past does affect a student's grades.

In [39]:
# Keep the nursery-school flag (col 19) and the G1-G3 grade tiers (cols 30-32).
temp = data.map(lambda row: [row[19], row[30], row[31], row[32]])
yes = temp.filter(lambda rec: rec[0] == 1)  # attended nursery school
no = temp.filter(lambda rec: rec[0] == 0)   # did not attend

In [40]:
# Size of the "yes" group and its share of all students.
c = yes.count()
print("Number of students who attended nursery school: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(yes, c)

Number of students who attended nursery school: 521 (80.28%)

For G1, number of students having...
[['Failed', 122, '23.42%'], ['Low Pass', 212, '40.69%'], ['Med Pass', 170, '32.63%'], ['High Pass', 17, '3.26%']]
For G2, number of students having...
[['Failed', 112, '21.5%'], ['Low Pass', 212, '40.69%'], ['Med Pass', 170, '32.63%'], ['High Pass', 27, '5.18%']]
For G3, number of students having...
[['Failed', 81, '15.55%'], ['Low Pass', 209, '40.12%'], ['Med Pass', 194, '37.24%'], ['High Pass', 37, '7.1%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      603|      633|      708|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.16|      1.21|      1.36|
+----------+----------+----------+



In [41]:
# Size of the "no" group and its share of all students.
c = no.count()
print("Number of students who did not attend nursery school: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(no, c)

Number of students who did not attend nursery school: 128 (19.72%)

For G1, number of students having...
[['Failed', 35, '27.34%'], ['Low Pass', 56, '43.75%'], ['Med Pass', 30, '23.44%'], ['High Pass', 7, '5.47%']]
For G2, number of students having...
[['Failed', 33, '25.78%'], ['Low Pass', 60, '46.88%'], ['Med Pass', 27, '21.09%'], ['High Pass', 8, '6.25%']]
For G3, number of students having...
[['Failed', 19, '14.84%'], ['Low Pass', 64, '50.0%'], ['Med Pass', 36, '28.12%'], ['High Pass', 9, '7.03%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      137|      138|      163|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.07|      1.08|      1.27|
+----------+----------+----------+



Students who attended nursery school perform better than those who didn't. It is believed that students who attended nursery school have developed skills that are useful as they go through their schooling years.

## Part 11: Higher Education Plans?

We will try to determine whether a student's desire to go for higher education motivates the student to do better.

In [42]:
# Keep the higher-education flag (col 20) and the G1-G3 grade tiers (cols 30-32).
temp = data.map(lambda row: [row[20], row[30], row[31], row[32]])
yes = temp.filter(lambda rec: rec[0] == 1)  # wants higher education
no = temp.filter(lambda rec: rec[0] == 0)   # does not

In [43]:
# Size of the "yes" group and its share of all students.
c = yes.count()
print("Number of students who want to pursue higher education: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(yes, c)

Number of students who want to pursue higher education: 580 (89.37%)

For G1, number of students having...
[['Failed', 106, '18.28%'], ['Low Pass', 251, '43.28%'], ['Med Pass', 199, '34.31%'], ['High Pass', 24, '4.14%']]
For G2, number of students having...
[['Failed', 103, '17.76%'], ['Low Pass', 248, '42.76%'], ['Med Pass', 194, '33.45%'], ['High Pass', 35, '6.03%']]
For G3, number of students having...
[['Failed', 67, '11.55%'], ['Low Pass', 239, '41.21%'], ['Med Pass', 228, '39.31%'], ['High Pass', 46, '7.93%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      721|      741|      833|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.24|      1.28|      1.44|
+----------+----------+----------+



In [44]:
# Size of the "no" group and its share of all students.
c = no.count()
print("Number of students who do not want to pursue higher education: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(no, c)

Number of students who do not want to pursue higher education: 69 (10.63%)

For G1, number of students having...
[['Failed', 51, '73.91%'], ['Low Pass', 17, '24.64%'], ['Med Pass', 1, '1.45%']]
For G2, number of students having...
[['Failed', 42, '60.87%'], ['Low Pass', 24, '34.78%'], ['Med Pass', 3, '4.35%']]
For G3, number of students having...
[['Failed', 33, '47.83%'], ['Low Pass', 34, '49.28%'], ['Med Pass', 2, '2.9%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|       19|       30|       38|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      0.28|      0.43|      0.55|
+----------+----------+----------+



Students who want to pursue higher education clearly score higher than those who don't.

We can tell that students who want to pursue higher education are motivated to do so, therefore, they know that they have to score well in their grades in order for them to be able to go to higher education.

## Part 12: Internet At Home

We will try to determine the influence of Internet access at home on a student's grades.

In [45]:
# Keep the home-internet flag (col 21) and the G1-G3 grade tiers (cols 30-32).
temp = data.map(lambda row: [row[21], row[30], row[31], row[32]])
yes = temp.filter(lambda rec: rec[0] == 1)  # has internet at home
no = temp.filter(lambda rec: rec[0] == 0)   # does not

In [46]:
# Size of the "yes" group and its share of all students.
c = yes.count()
print("Number of students who have internet at home: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(yes, c)

Number of students who have internet at home: 498 (76.73%)

For G1, number of students having...
[['Failed', 106, '21.29%'], ['Low Pass', 208, '41.77%'], ['Med Pass', 165, '33.13%'], ['High Pass', 19, '3.82%']]
For G2, number of students having...
[['Failed', 100, '20.08%'], ['Low Pass', 209, '41.97%'], ['Med Pass', 162, '32.53%'], ['High Pass', 27, '5.42%']]
For G3, number of students having...
[['Failed', 68, '13.65%'], ['Low Pass', 199, '39.96%'], ['Med Pass', 192, '38.55%'], ['High Pass', 39, '7.83%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      595|      614|      700|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.19|      1.23|      1.41|
+----------+----------+----------+



In [47]:
# Size of the "no" group and its share of all students.
c = no.count()
print("Number of students who do not have internet at home: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(no, c)

Number of students who do not have internet at home: 151 (23.27%)

For G1, number of students having...
[['Failed', 51, '33.77%'], ['Low Pass', 60, '39.74%'], ['Med Pass', 35, '23.18%'], ['High Pass', 5, '3.31%']]
For G2, number of students having...
[['Failed', 45, '29.8%'], ['Low Pass', 63, '41.72%'], ['Med Pass', 35, '23.18%'], ['High Pass', 8, '5.3%']]
For G3, number of students having...
[['Failed', 32, '21.19%'], ['Low Pass', 74, '49.01%'], ['Med Pass', 38, '25.17%'], ['High Pass', 7, '4.64%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      145|      157|      171|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      0.96|      1.04|      1.13|
+----------+----------+----------+



Students who have Internet at home perform better than those who don't.

With the Internet, students have easier and faster access to resources and knowledge right from their home.

## Part 13: Romantic Relationships

We will try to determine how romantic relationships can affect a student's grades.

In [48]:
# Keep the romantic-relationship flag (col 22) and the G1-G3 grade tiers (cols 30-32).
temp = data.map(lambda row: [row[22], row[30], row[31], row[32]])
yes = temp.filter(lambda rec: rec[0] == 1)  # in a romantic relationship
no = temp.filter(lambda rec: rec[0] == 0)   # not in one

In [49]:
# Size of the "yes" group and its share of all students.
c = yes.count()
print("Number of students who are on a romantic relationship: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(yes, c)

Number of students who are on a romantic relationship: 239 (36.83%)

For G1, number of students having...
[['Failed', 66, '27.62%'], ['Low Pass', 96, '40.17%'], ['Med Pass', 73, '30.54%'], ['High Pass', 4, '1.67%']]
For G2, number of students having...
[['Failed', 63, '26.36%'], ['Low Pass', 98, '41.0%'], ['Med Pass', 72, '30.13%'], ['High Pass', 6, '2.51%']]
For G3, number of students having...
[['Failed', 46, '19.25%'], ['Low Pass', 99, '41.42%'], ['Med Pass', 81, '33.89%'], ['High Pass', 13, '5.44%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      254|      260|      300|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.06|      1.09|      1.26|
+----------+----------+----------+



In [50]:
# Size of the "no" group and its share of all students.
c = no.count()
print("Number of students who are not on a romantic relationship: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(no, c)

Number of students who are not on a romantic relationship: 410 (63.17%)

For G1, number of students having...
[['Failed', 91, '22.2%'], ['Low Pass', 172, '41.95%'], ['Med Pass', 127, '30.98%'], ['High Pass', 20, '4.88%']]
For G2, number of students having...
[['Failed', 82, '20.0%'], ['Low Pass', 174, '42.44%'], ['Med Pass', 125, '30.49%'], ['High Pass', 29, '7.07%']]
For G3, number of students having...
[['Failed', 54, '13.17%'], ['Low Pass', 174, '42.44%'], ['Med Pass', 149, '36.34%'], ['High Pass', 33, '8.05%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      486|      511|      571|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.19|      1.25|      1.39|
+----------+----------+----------+



Students who are not in a romantic relationship perform better than those who are.

There's a lot to do in a romantic relationship, and students who are in a romantic relationship will have to spend time with their partners, and unfortunately that reduces the time available for them to study.

## Part 14: Quality of Family Relationships

We will try to determine how the quality of family relationships can affect a student's grades.

In [51]:
# Keep the family-relationship rating (col 23) and the G1-G3 grade tiers (cols 30-32).
temp = data.map(lambda row: [row[23], row[30], row[31], row[32]])

# Ratings at or above this value count as "good".
# NOTE(review): the lambdas below look up the global `threshold` lazily, so
# run the dependent cells before `threshold` is reassigned by a later section.
threshold = 3

high = temp.filter(lambda rec: rec[0] >= threshold)  # good relationships
low = temp.filter(lambda rec: rec[0] < threshold)    # bad relationships

In [52]:
# Size of the "high" group and its share of all students.
c = high.count()
print("Number of students who have good family relationships: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(high, c)

Number of students who have good family relationships: 598 (92.14%)

For G1, number of students having...
[['Failed', 140, '23.41%'], ['Low Pass', 249, '41.64%'], ['Med Pass', 187, '31.27%'], ['High Pass', 22, '3.68%']]
For G2, number of students having...
[['Failed', 126, '21.07%'], ['Low Pass', 254, '42.47%'], ['Med Pass', 184, '30.77%'], ['High Pass', 34, '5.69%']]
For G3, number of students having...
[['Failed', 86, '14.38%'], ['Low Pass', 252, '42.14%'], ['Med Pass', 215, '35.95%'], ['High Pass', 45, '7.53%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      689|      724|      817|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.15|      1.21|      1.37|
+----------+----------+----------+



In [53]:
# Size of the "low" group and its share of all students.
c = low.count()
print("Number of students who have bad family relationships: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(low, c)

Number of students who have bad family relationships: 51 (7.86%)

For G1, number of students having...
[['Failed', 17, '33.33%'], ['Low Pass', 19, '37.25%'], ['Med Pass', 13, '25.49%'], ['High Pass', 2, '3.92%']]
For G2, number of students having...
[['Failed', 19, '37.25%'], ['Low Pass', 18, '35.29%'], ['Med Pass', 13, '25.49%'], ['High Pass', 1, '1.96%']]
For G3, number of students having...
[['Failed', 14, '27.45%'], ['Low Pass', 21, '41.18%'], ['Med Pass', 15, '29.41%'], ['High Pass', 1, '1.96%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|       51|       47|       54|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|       1.0|      0.92|      1.06|
+----------+----------+----------+



Although we have a student or two who manage to pass with flying colors despite bad family relationships, good family relationships can be considered a factor for better academic performance.

It is <a href="https://oureverydaylife.com/poor-relationships-affect-family-12300111.html">evident</a> that students with bad family relationships can develop stress and/or health problems, which in turn affects them academically.

## Part 15: Free Time

We will try to determine how the amount of free time after school can affect a student's grades.

In [54]:
# Keep the free-time rating (col 24) and the G1-G3 grade tiers (cols 30-32).
temp = data.map(lambda row: [row[24], row[30], row[31], row[32]])

# Ratings at or above this value count as "lots of free time".
# NOTE(review): the lambdas below look up the global `threshold` lazily, so
# run the dependent cells before `threshold` is reassigned by a later section.
threshold = 3

high = temp.filter(lambda rec: rec[0] >= threshold)  # lots of free time
low = temp.filter(lambda rec: rec[0] < threshold)    # not much free time

In [55]:
# `high` contains the students whose free-time rating is >= threshold, i.e.
# the ones who DO have a lot of free time, so the label has to say exactly that.
c = high.count()
print("Number of students who have a lot of free time after school: ", c, " (", percent(c, data.count()), ")", sep='')
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(high, c)

Number of students who have a lot of free time after school: 497 (76.58%)

For G1, number of students having...
[['Failed', 127, '25.55%'], ['Low Pass', 207, '41.65%'], ['Med Pass', 148, '29.78%'], ['High Pass', 15, '3.02%']]
For G2, number of students having...
[['Failed', 119, '23.94%'], ['Low Pass', 209, '42.05%'], ['Med Pass', 144, '28.97%'], ['High Pass', 25, '5.03%']]
For G3, number of students having...
[['Failed', 81, '16.3%'], ['Low Pass', 218, '43.86%'], ['Med Pass', 166, '33.4%'], ['High Pass', 32, '6.44%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      548|      572|      646|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|       1.1|      1.15|       1.3|
+----------+----------+----------+



In [56]:
# `low` contains the students whose free-time rating is < threshold, i.e.
# the ones who do NOT have a lot of free time, so the label has to say exactly that.
c = low.count()
print("Number of students who do not have a lot of free time after school: ", c, " (", percent(c, data.count()), ")", sep='')
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(low, c)

Number of students who do not have a lot of free time after school: 152 (23.42%)

For G1, number of students having...
[['Failed', 30, '19.74%'], ['Low Pass', 61, '40.13%'], ['Med Pass', 52, '34.21%'], ['High Pass', 9, '5.92%']]
For G2, number of students having...
[['Failed', 26, '17.11%'], ['Low Pass', 63, '41.45%'], ['Med Pass', 53, '34.87%'], ['High Pass', 10, '6.58%']]
For G3, number of students having...
[['Failed', 19, '12.5%'], ['Low Pass', 55, '36.18%'], ['Med Pass', 64, '42.11%'], ['High Pass', 14, '9.21%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      192|      199|      225|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.26|      1.31|      1.48|
+----------+----------+----------+



Students who do not have much free time after school perform better than those who have much free time after school.

Possible hypothesis: Students who do not have much free time probably use what little time they have left for studying, whereas a fraction of the students who have plenty of free time spend it on whatever they like and neglect their studies.

## Part 16: Weekly Alcohol Consumption

We will try to determine how alcohol consumption can affect a student's grades.

We assume:
* weekday consists of five days, and weekend consists of two days
* the students' alcohol consumption habits remain constant throughout the year.

In [57]:
# Weekly alcohol score: day-weighted mean of the weekday rating (col 26, 5 days)
# and the weekend rating (col 27, 2 days), followed by the G1-G3 grade tiers.
temp = data.map(lambda row: [(row[26]*5 + row[27]*2) / 7, row[30], row[31], row[32]])

# [lower bound of "high", lower bound of "moderate"]
# NOTE(review): the lambdas below look up the global `threshold` lazily, so
# run the dependent cells before `threshold` is reassigned by a later section.
threshold = [4, 2]

high = temp.filter(lambda rec: rec[0] >= threshold[0])                 # high alcohol consumption
mid = temp.filter(lambda rec: threshold[1] <= rec[0] < threshold[0])   # moderate alcohol consumption
low = temp.filter(lambda rec: 1 < rec[0] < threshold[1])               # low alcohol consumption
nul = temp.filter(lambda rec: rec[0] == 1)                             # no alcohol consumption *at all*

Footnote: The lowest possible value for this factor is 1, so we assume that if a student's weekly alcohol consumption is rated at 1, it means that the student does not consume alcohol at all.

In [58]:
# Size of the "high" group and its share of all students.
c = high.count()
print("Number of students who consume a high amount of alcohol every week: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(high, c)

Number of students who consume a high amount of alcohol every week: 27 (4.16%)

For G1, number of students having...
[['Failed', 11, '40.74%'], ['Low Pass', 12, '44.44%'], ['Med Pass', 4, '14.81%']]
For G2, number of students having...
[['Failed', 6, '22.22%'], ['Low Pass', 19, '70.37%'], ['Med Pass', 2, '7.41%']]
For G3, number of students having...
[['Failed', 8, '29.63%'], ['Low Pass', 15, '55.56%'], ['Med Pass', 4, '14.81%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|       20|       23|       23|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      0.74|      0.85|      0.85|
+----------+----------+----------+



In [59]:
# Size of the "mid" group and its share of all students.
c = mid.count()
print("Number of students who consume a moderate amount of alcohol every week: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(mid, c)

Number of students who consume a moderate amount of alcohol every week: 173 (26.66%)

For G1, number of students having...
[['Failed', 52, '30.06%'], ['Low Pass', 79, '45.66%'], ['Med Pass', 35, '20.23%'], ['High Pass', 7, '4.05%']]
For G2, number of students having...
[['Failed', 52, '30.06%'], ['Low Pass', 77, '44.51%'], ['Med Pass', 37, '21.39%'], ['High Pass', 7, '4.05%']]
For G3, number of students having...
[['Failed', 34, '19.65%'], ['Low Pass', 88, '50.87%'], ['Med Pass', 42, '24.28%'], ['High Pass', 9, '5.2%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      170|      172|      199|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      0.98|      0.99|      1.15|
+----------+----------+----------+



In [60]:
# Size of the "low" group and its share of all students.
c = low.count()
print("Number of students who consume a low amount of alcohol every week: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(low, c)

Number of students who consume a low amount of alcohol every week: 208 (32.05%)

For G1, number of students having...
[['Failed', 41, '19.71%'], ['Low Pass', 90, '43.27%'], ['Med Pass', 68, '32.69%'], ['High Pass', 9, '4.33%']]
For G2, number of students having...
[['Failed', 42, '20.19%'], ['Low Pass', 84, '40.38%'], ['Med Pass', 68, '32.69%'], ['High Pass', 14, '6.73%']]
For G3, number of students having...
[['Failed', 32, '15.38%'], ['Low Pass', 76, '36.54%'], ['Med Pass', 83, '39.9%'], ['High Pass', 17, '8.17%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      253|      262|      293|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.22|      1.26|      1.41|
+----------+----------+----------+



In [61]:
# Size of the "nul" group and its share of all students.
c = nul.count()
print("Number of students who do not consume alcohol at all: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(nul, c)

Number of students who do not consume alcohol at all: 241 (37.13%)

For G1, number of students having...
[['Failed', 53, '21.99%'], ['Low Pass', 87, '36.1%'], ['Med Pass', 93, '38.59%'], ['High Pass', 8, '3.32%']]
For G2, number of students having...
[['Failed', 45, '18.67%'], ['Low Pass', 92, '38.17%'], ['Med Pass', 90, '37.34%'], ['High Pass', 14, '5.81%']]
For G3, number of students having...
[['Failed', 26, '10.79%'], ['Low Pass', 94, '39.0%'], ['Med Pass', 101, '41.91%'], ['High Pass', 20, '8.3%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      297|      314|      356|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.23|       1.3|      1.48|
+----------+----------+----------+



The more alcohol a student consumes every week, the worse the student performs -- alcohol affects the human body's systems and, if consumed excessively, harms one's health physically and mentally. Students who consume lots of alcohol every week may not be able to think properly, and this will definitely affect their studies.

Footnote: More weightage is placed on weekday alcohol consumption, because students are supposed to go to school and attend classes during the weekday -- therefore it is bad if a student has a high weekday alcohol consumption!

## Part 17: Health Condition

We will try to determine how the health condition of a student can affect a student's grades.

In [62]:
# Keep the health rating (col 28) and the G1-G3 grade tiers (cols 30-32).
temp = data.map(lambda row: [row[28], row[30], row[31], row[32]])

# Ratings at or above this value count as "good health".
# NOTE(review): the lambdas below look up the global `threshold` lazily, so
# run the dependent cells before `threshold` is reassigned by a later section.
threshold = 3

high = temp.filter(lambda rec: rec[0] >= threshold)  # good health condition
low = temp.filter(lambda rec: rec[0] < threshold)    # bad health condition

In [63]:
# Size of the "high" group and its share of all students.
c = high.count()
print("Number of students who have good health condition: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(high, c)

Number of students who have good health condition: 481 (74.11%)

For G1, number of students having...
[['Failed', 116, '24.12%'], ['Low Pass', 212, '44.07%'], ['Med Pass', 136, '28.27%'], ['High Pass', 17, '3.53%']]
For G2, number of students having...
[['Failed', 107, '22.25%'], ['Low Pass', 221, '45.95%'], ['Med Pass', 130, '27.03%'], ['High Pass', 23, '4.78%']]
For G3, number of students having...
[['Failed', 74, '15.38%'], ['Low Pass', 217, '45.11%'], ['Med Pass', 158, '32.85%'], ['High Pass', 32, '6.65%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      535|      550|      629|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.11|      1.14|      1.31|
+----------+----------+----------+



In [64]:
# Size of the "low" group and its share of all students.
c = low.count()
print("Number of students who have bad health condition: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(low, c)

Number of students who have bad health condition: 168 (25.89%)

For G1, number of students having...
[['Failed', 41, '24.4%'], ['Low Pass', 56, '33.33%'], ['Med Pass', 64, '38.1%'], ['High Pass', 7, '4.17%']]
For G2, number of students having...
[['Failed', 38, '22.62%'], ['Low Pass', 51, '30.36%'], ['Med Pass', 67, '39.88%'], ['High Pass', 12, '7.14%']]
For G3, number of students having...
[['Failed', 26, '15.48%'], ['Low Pass', 56, '33.33%'], ['Med Pass', 72, '42.86%'], ['High Pass', 14, '8.33%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      205|      221|      242|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.22|      1.32|      1.44|
+----------+----------+----------+



Based on average, we can find that students with bad health condition generally perform better than those with good health condition.

Possible hypothesis: Sleep is an important factor in human health, and <a href="https://www.livescience.com/55021-sleep-school-age-kids.html">high school students tend to sleep less because of numerous demands, including academic demands</a>. This may contribute to poor health condition among some students.

## Part 18: Absences

We will try to determine how the number of absent days can affect a student's grades.

In [65]:
# Keep the number of absences (col 29) and the G1-G3 grade tiers (cols 30-32).
temp = data.map(lambda row: [row[29], row[30], row[31], row[32]])

# [lower bound of "high", lower bound of "medium"], in absent days
# NOTE(review): the lambdas below look up the global `threshold` lazily, so
# run the dependent cells before `threshold` is reassigned by a later section.
threshold = [25, 13]

high = temp.filter(lambda rec: rec[0] >= threshold[0])                 # high amount of absent days (>= 25 days)
mid = temp.filter(lambda rec: threshold[1] <= rec[0] < threshold[0])   # medium amount of absent days (>= 13 days)
low = temp.filter(lambda rec: 0 < rec[0] < threshold[1])               # low amount of absent days
nul = temp.filter(lambda rec: rec[0] == 0)                             # no absences

In [66]:
# Size of the "high" group and its share of all students.
c = high.count()
print("Number of students who have a high amount of absent days: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(high, c)

Number of students who have a high amount of absent days: 3 (0.46%)

For G1, number of students having...
[['Failed', 1, '33.33%'], ['Med Pass', 2, '66.67%']]
For G2, number of students having...
[['Failed', 1, '33.33%'], ['Med Pass', 2, '66.67%']]
For G3, number of students having...
[['Failed', 1, '33.33%'], ['Med Pass', 2, '66.67%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|        4|        4|        4|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.33|      1.33|      1.33|
+----------+----------+----------+



In [67]:
# Pull the high-absence rows ([absences, G1 tier, G2 tier, G3 tier]) back to the
# driver; kept as a bare expression so the notebook displays the returned list.
high.collect() # try to collect the three students' data with high amount of absent days

[[32, 2, 2, 2], [30, 2, 2, 2], [26, 0, 0, 0]]

In [68]:
# Size of the "mid" group and its share of all students.
c = mid.count()
print("Number of students who have a medium amount of absent days: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(mid, c)

Number of students who have a medium amount of absent days: 29 (4.47%)

For G1, number of students having...
[['Failed', 10, '34.48%'], ['Low Pass', 14, '48.28%'], ['Med Pass', 5, '17.24%']]
For G2, number of students having...
[['Failed', 11, '37.93%'], ['Low Pass', 14, '48.28%'], ['Med Pass', 4, '13.79%']]
For G3, number of students having...
[['Failed', 7, '24.14%'], ['Low Pass', 17, '58.62%'], ['Med Pass', 4, '13.79%'], ['High Pass', 1, '3.45%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|       24|       22|       28|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      0.83|      0.76|      0.97|
+----------+----------+----------+



In [69]:
# Size of the "low" group and its share of all students.
c = low.count()
print("Number of students who have a low amount of absent days: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(low, c)

Number of students who have a low amount of absent days: 373 (57.47%)

For G1, number of students having...
[['Failed', 101, '27.08%'], ['Low Pass', 157, '42.09%'], ['Med Pass', 108, '28.95%'], ['High Pass', 7, '1.88%']]
For G2, number of students having...
[['Failed', 91, '24.4%'], ['Low Pass', 161, '43.16%'], ['Med Pass', 110, '29.49%'], ['High Pass', 11, '2.95%']]
For G3, number of students having...
[['Failed', 58, '15.55%'], ['Low Pass', 171, '45.84%'], ['Med Pass', 124, '33.24%'], ['High Pass', 20, '5.36%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      394|      414|      479|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|      1.06|      1.11|      1.28|
+----------+----------+----------+



In [70]:
# Size of the "nul" group and its share of all students.
c = nul.count()
print("Number of students who are never absent: " + str(c) + " (" + percent(c, data.count()) + ")")
print()

# Per-grade tier breakdown, points and averages for this group.
splitdata(nul, c)

Number of students who are never absent: 244 (37.6%)

For G1, number of students having...
[['Failed', 45, '18.44%'], ['Low Pass', 97, '39.75%'], ['Med Pass', 85, '34.84%'], ['High Pass', 17, '6.97%']]
For G2, number of students having...
[['Failed', 42, '17.21%'], ['Low Pass', 97, '39.75%'], ['Med Pass', 81, '33.2%'], ['High Pass', 24, '9.84%']]
For G3, number of students having...
[['Failed', 34, '13.93%'], ['Low Pass', 85, '34.84%'], ['Med Pass', 100, '40.98%'], ['High Pass', 25, '10.25%']]
+---------+---------+---------+
|G1 points|G2 points|G3 points|
+---------+---------+---------+
|      318|      331|      360|
+---------+---------+---------+

+----------+----------+----------+
|G1 average|G2 average|G3 average|
+----------+----------+----------+
|       1.3|      1.36|      1.48|
+----------+----------+----------+



<a href="https://www.attendanceworks.org/chronic-absence/the-problem/10-facts-about-school-attendance/">Theory: Students with chronic absence records tend to perform worse than those with lighter absence records or those with no absence records.</a>

Although it is true that students with worse absence records generally score lower average grades, there are 3 students whose number of absent days is considered high, yet the average grade for these three students is almost the same as that of the students without absences, despite one of the three failing.

Possible reasons:
* The student with a high amount of absent days may be a public figure or a professional athlete
* The student with a high amount of absent days may have a major event in life that resulted in that student's absence for many days

## Conclusions and Future Work

There are many factors which affect students' academic performance.
Since we have 33 attributes in the dataset, we can dig out a lot of information regarding students' performance.
In order for students to perform well academically, students have to be in perfect condition -- physically and mentally.

Possible future work: If we have the data like this regarding students in Taiwan, we can use that and figure out the factors that affect students in Taiwan.

## References

### Dataset
Student Performance Data Set by Paulo Cortez, University of Minho, Portugal. Retrieved from<br>
https://archive.ics.uci.edu/ml/datasets/student%2Bperformance

### Other information
* Fujita, K. (2006). The Effects of Extracurricular Activities on the Academic Performance of Junior High Students. Undergraduate Research Journal for the Human Sciences, 5.<br>
https://www.kon.org/urc/v5/fujita.html
* How Poor Relationships Affect the Family. Retrieved from<br>
https://oureverydaylife.com/poor-relationships-affect-family-12300111.html
* 10 Facts About School Attendance. Retrieved from<br>
https://www.attendanceworks.org/chronic-absence/the-problem/10-facts-about-school-attendance/