In [1]:


import string
import random
import math

import numpy as np

### Question 1:Student Grades (60 points)

You are given a csv file of student grades with some meta-data. You are asked to calculate the final point grade of the students. The format of the given file is as follows:
* The first line contains the name of the course
* The second line contains the number of students
* The third line contains the text "ID" and names of the graded work (homeworks, midterm exams and the final exam), all of which are separated with commas. The details will be given later
* The remaining lines contain the information corresponding to the information given in the third line in a comma separated list.

A simple example is given below (also in files/simple.txt)  

    UNIV111  
    5  
    ID,HW1,HW2,MT,Final  
    7749,76.,55.,62.,73.  
    6659,44.,78.,31.,67.  
    1583,25.,85,86,30.  
    1023,46.,26.,75.,31.  
    1479,78.,52.,82.,40.  

**Part I:** Read the File (25 points)

The column names, aside from the id, given in the third line of the file dictate the grade type. These are as follows:
* All the names that start with `'HW'` are homeworks. There may be multiple homeworks. They will always have unique names.
* All the names that start with `'MT'` are midterms. There may be multiple exams. They will always have unique names.
* The name `'Final'` is the final. There is always one final.

Implement this in the `read_grades` function below. You do not need to do any error handling.


In [1]:
def read_grades(path_to_file):
    """
    Inputs: 
    path_to_file: The path to the grades file with the format defined above
    
    Outputs: 
    course_name: The name of the course
    ids: The numpy array containing student ids. Type 'int32'
    hws: The numpy array containing the homework grades. 
         Shape depends on the number of homework columns. Type 'float64'
    mts: The numpy array containing the midterm grades
         Shape depends on the number of midterm columns. Type 'float64'
    f: The numpy array containing the final grades. Type 'float64'
    """
    
    # YOUR CODE HERE
    with open(path_to_file,'r') as file:
        course_name = file.readline().rstrip()
        num_students = int(file.readline().rstrip())
        column_names = file.readline().rstrip().split(',')
        num_columns = len(column_names)

        num_hws = 0
        num_mts = 0
        
        for name in column_names:
            if name[:2] == 'HW':
                num_hws += 1
            if name[:2] == 'MT':
                num_mts += 1
        
        ids = np.empty((num_students,),dtype='int32')
        f = np.empty((num_students,))
        
        hws = np.empty((num_students,num_hws))
        mts = np.empty((num_students,num_mts))
        
        current_row = 0
        for line in file:
            current_col = 0
            values = line.rstrip().split(',')
            ids[current_row] = values[current_col]
            current_col += 1
            
            for i in range(num_hws):
                hws[current_row,i] = values[current_col]
                current_col += 1
            
            for i in range(num_mts):
                mts[current_row,i] = values[current_col]
                current_col += 1
            
            f[current_row] = values[current_col]
            
            
            current_row += 1

        hws = hws.squeeze()
        mts = mts.squeeze()
        
    
    return course_name, ids, hws, mts, f

In [2]:
course_name, ids, hws, mts, f = read_grades('./files/simple.txt')

cnDebug = 'UNIV111'
idDebug = np.array([7749, 6659, 1583, 1023, 1479])
hwDebug = np.array([[76., 55.],
                    [44., 78.],
                    [25., 85.],
                    [46., 26.],
                    [78., 52.]])
mtDebug = np.array([62., 31., 86., 75., 82.]) #could also be 2D, in fact needs to be 2D if there are more than 1 midterm
fDebug = np.array([73., 67., 30., 31., 40.])

FileNotFoundError: [Errno 2] No such file or directory: './files/simple.txt'

In [4]:
print(cnDebug==course_name)
print(((ids-idDebug) == 0).all())
print(((hws-hwDebug) == 0).all())
print(((mts-mtDebug) == 0).all())
print(((f-fDebug) == 0).all())

True
True
True
True
True


**Part II:** Calculate the Weighted Average (10 points)

The weighted average is calculated as follows:  
$Weighted Average = \alpha_{hw} \cdot \mu_{HW} + \alpha_{mt} \cdot \mu_{MT} + \alpha_{f} \cdot  F$  

where $\mu_{HW}$ is the average of the homeworks, $\mu_{MT}$ is the average of the midterms $F$ is the final grade and $\alpha = [\alpha_{hw} \alpha_{mt} \alpha_{f}]$ is their respective weights. 

Implement this in the `WeightedAverage` function below. 

(We will supply our own data so do not worry if you were not able to do Part I)

In [5]:
def WeightedAverage(grades, alpha = [0.25,0.3,0.45]):
    """
    This function calculates the weighted average of all the graded items.
    
    Inputs: 
    grades: The container of homeworks, midterm and final grades. 
    alpha: A list or 1D NumPy array of size 3 with the weights.
           The order is homework weight, midterm weight and final weight
           
           
    Outputs:
    weighted: 1D NumPy array of the weighted average of the grades as calculated above.
    The indices of the student_ids and grades must match
    """
    
    # YOUR CODE HERE
    if len(grades[0].shape) == 1:
        muHW = grades[0]
    else:
        muHW = grades[0].mean(axis=1)
        
    if len(grades[1].shape) == 1:
        muMT = grades[1]
    else:
        muMT = grades[1].mean(axis=1)
        
    weighted = muHW*alpha[0] + muMT*alpha[1] + grades[2]*alpha[2]
            
    return weighted
    

In [6]:
hws = hwDebug
mts = mtDebug
f = fDebug
weighted = WeightedAverage((hws, mts, f), [0.3,0.35,0.35])

weightedDebug = [66.9, 52.6 , 57.1 , 47.9 , 62.2]

print(weighted-weightedDebug)

[-1.42108547e-14 -7.10542736e-15 -7.10542736e-15  0.00000000e+00
  0.00000000e+00]


In [7]:
# A bit more involved:
course_name2, ids2, hws2, mts2, f2= read_grades('./files/comp101.txt')
weighted2 = WeightedAverage((hws2, mts2, f2), [0.25,0.35,0.4])

Look at this [spreadsheet](https://docs.google.com/spreadsheets/d/1q792k5Ltk1i17BtDA5OO2oVOI8iy28ghezn5x1JHsgU/edit?usp=sharing) for debugging

**Part III:** Summary Output (25 points)

You are asked to write the grades summary in a file. The format is as follows:

    Course Name: <course_name>
    Number of Enrolled Students: <num_students>
    Student with the Highest Grade: <highest_id>
    Student with the Lowest Grade: <lowest_id>

    Weighted Average
    - Minimum Grade: <min_wavg_grade>
    - Maximum Grade: <max_wavg_grade>
    - Average Grade: <avg_wavg_grade>
    - Median Grade: <med_wavg_grade>

    HW<i>
    - Minimum Grade: <min_hwi_grade>
    - Maximum Grade: <max_hwi_grade>
    - Average Grade: <avg_hwi_grade>
    - Median Grade: <med_hwi_grade>

    MT<i>
    - Minimum Grade: <min_mti_grade>
    - Maximum Grade: <max_mti_grade>
    - Average Grade: <avg_mti_grade>
    - Median Grade: <med_mti_grade>

    Final
    - Minimum Grade: <min_fin_grade>
    - Maximum Grade: <max_fin_grade>
    - Average Grade: <avg_fin_grade>
    - Median Grade: <med_fin_grade>
    
The text in angle brackets are taken or calculated from the grades. All the grade number should be **rounded to 2 decimal places**. The HW and MT blocks are repeated as many times as needed. An example is given to you in the files/simpleSummaryExpected.txt.

Implement the code to write this output in the `summaryOutput` function below.

(We will supply our own data so do not worry if you were not able to do Part I or II. Use the given debugging variables above to test your code in this case)

In [8]:
def getBlockText(title, data):
    min_grade = data.min()
    max_grade = data.max()
    avg_grade = data.mean()
    med_grade = np.median(data)
    
    return f"""{title}
- Minimum Grade: {min_grade:.2f}
- Maximum Grade: {max_grade:.2f}
- Average Grade: {avg_grade:.2f}
- Median Grade: {med_grade:.2f}"""

def writeMultiples(baseTitle, data, file):
    singleRun = True
    if data.ndim > 1:
        if data.shape[1] > 1:
            singleRun = False
    if singleRun:
        file.write(getBlockText(baseTitle, data))
    else:
        for i in range(data.shape[1]):
            file.write(getBlockText(f"{baseTitle}{i}", data[i,:]))
            if i != data.shape[1]-1:
                file.write("\n\n")

# End of file newlines will not matter
def summaryOutput(path_to_file, course_name, ids, hws, mts, f, weighted_average):
    """
    This function writes the summary of the grades to a given file.
    
    Inputs:
    path_to_file: The desired path of the output file
    course_name, ids, hws, mts, f, weighted_average: are all as defined previously in the notebook
    
    Outputs:
    None 
    """
    
    num_students = f.shape[0]
    highest_id = ids[np.argmax(weighted_average)]
    lowest_id = ids[np.argmin(weighted_average)]
    
    headerBoilerPlate = f"""Course Name: {course_name}
Number of Enrolled Students: {num_students}
Student with the Highest Grade: {highest_id}
Student with the Lowest Grade: {lowest_id}
"""
    
    # YOUR CODE HERE
    with open(path_to_file, 'w') as summary:
        summary.write(headerBoilerPlate+"\n")        
        writeMultiples("Weighted Average", weighted_average, summary)
        summary.write("\n\n")
        writeMultiples('HW', hws, summary)
        summary.write("\n\n")
        writeMultiples('MT', mts, summary)
        summary.write("\n\n")
        writeMultiples("Final",f, summary)
    
    return None

In [9]:
course_name = cnDebug
ids = idDebug
summaryOutput('./files/simpleSummary.txt', course_name, ids, hws, mts, f, weighted)

In [10]:
with open('./files/simpleSummaryExpected.txt') as tmp:
    simpleSummaryExpected = tmp.read()
with open('./files/simpleSummary.txt') as tmp:
    simpleSummary = tmp.read()

In [11]:
print(simpleSummaryExpected==simpleSummary)

False


In [12]:
print(simpleSummary)

Course Name: UNIV111
Number of Enrolled Students: 5
Student with the Highest Grade: 7749
Student with the Lowest Grade: 1023

Weighted Average
- Minimum Grade: 47.90
- Maximum Grade: 66.90
- Average Grade: 57.34
- Median Grade: 57.10

HW0
- Minimum Grade: 55.00
- Maximum Grade: 76.00
- Average Grade: 65.50
- Median Grade: 65.50

HW1
- Minimum Grade: 44.00
- Maximum Grade: 78.00
- Average Grade: 61.00
- Median Grade: 61.00

MT
- Minimum Grade: 31.00
- Maximum Grade: 86.00
- Average Grade: 67.20
- Median Grade: 75.00

Final
- Minimum Grade: 30.00
- Maximum Grade: 73.00
- Average Grade: 48.20
- Median Grade: 40.00


In [13]:
print(simpleSummaryExpected)

Course Name: UNIV111
Number of Enrolled Students: 5
Student with the Highest Grade: 7749
Student with the Lowest Grade: 1023

Weighted Average
- Minimum Grade: 47.9
- Maximum Grade: 66.9
- Average Grade: 57.34
- Median Grade: 57.1

HW1
- Minimum Grade: 25.0
- Maximum Grade: 78.0
- Average Grade: 53.8
- Median Grade: 46.0

HW2
- Minimum Grade: 26.0
- Maximum Grade: 78.0
- Average Grade: 59.2
- Median Grade: 55.0

MT
- Minimum Grade: 31.0
- Maximum Grade: 86.0
- Average Grade: 67.2
- Median Grade: 75.0

Final
- Minimum Grade: 30.0
- Maximum Grade: 73.0
- Average Grade: 48.2
- Median Grade: 40.0



The previously given spreadsheet also has the statistics for the Comp101.txt, we recommend you check it as well.

### Question 2: Data Preprocessing (40 points)

You are asked to implement several data preprocessing approaches:
* Data Clipping: If a data point is smaller/larger than a lower/higher threshold, clip the data to the threshold.
* Min-Max Scaling: Mapping the data between a given range linearly
* Median Filling: Replace any missing data with the median of the non-missing data
* randomSelection: Selecting a random percentage of data

We are going to assume that the given data is a 2D NumPy array of shape (n,d) where n is the number of points and d is the dimensionality of each data point (e.g. number of features, number of observations etc.)

You do not need to do any error handling. However, debugging is left entirely upto you!

In [14]:
# WARNING: We added a parameter to signal whether to preserve the original or change it
# We do not care about this for grading. 
# However, the ability to have this option is important in real applications!
# 5 points
def dataClipping(data, lower, higher, columnID=None, inPlace = False):
    """
    This function clips the given data between the lower and higher thresholds
    A 1D example:
    Let input = [-10,5,15], lower = -4, higher = 6
    Then output = [-4,5,6]
    
    Inputs:
    data: A 2D numpy array of shape (n,d)
    lower: Lower threshold
    higher: Higher threshold
    columnID: The column to apply clipping to. If None, apply clipping to all the columns
    
    Outputs:
    dataP: The clipped version of data, still a 2D numpy array of shape (n,d)
    """
    
    # YOUR CODE HERE
    if not inPlace:
        data = data.copy()
    # Using conditional indexing!
    if columnID:
        lowerInds = data[:,columnID] < lower
        higherInds = data[:,columnID] > higher
        data[lowerInds,columnID] = lower
        data[higherInds,columnID] = higher
    else:
        data[data < lower] = lower
        data[data > higher] = higher
    
    return data

In [15]:
data = np.random.randint(0,10,(10,3))
print('Original Data')
print(data)
print(data.min(),data.max())
print('Clipped Data')
clipped = dataClipping(data,3,6)
print(clipped)
print(clipped.min(),clipped.max())

Original Data
[[1 6 9]
 [9 3 9]
 [0 2 8]
 [4 6 3]
 [1 3 9]
 [2 6 3]
 [6 3 3]
 [5 5 2]
 [4 6 1]
 [5 3 7]]
0 9
Clipped Data
[[3 6 6]
 [6 3 6]
 [3 3 6]
 [4 6 3]
 [3 3 6]
 [3 6 3]
 [6 3 3]
 [5 5 3]
 [4 6 3]
 [5 3 6]]
3 6


In [16]:
print('Original Data')
print(data)
print('Only column 1 clipped data')
clippedC1 = dataClipping(data,3,6,1)
print(clippedC1)
print(clippedC1.min(axis=0),clippedC1.max(axis=0))

Original Data
[[1 6 9]
 [9 3 9]
 [0 2 8]
 [4 6 3]
 [1 3 9]
 [2 6 3]
 [6 3 3]
 [5 5 2]
 [4 6 1]
 [5 3 7]]
Only column 1 clipped data
[[1 6 9]
 [9 3 9]
 [0 3 8]
 [4 6 3]
 [1 3 9]
 [2 6 3]
 [6 3 3]
 [5 5 2]
 [4 6 1]
 [5 3 7]]
[0 3 1] [9 6 9]


In [17]:
# 10 points
def minMaxScaling(data, lower, higher, inPlace = False):
    """
    This function scales each column of the data to be between lower and higher.
    A 1D example:
    Let input = [-10,5,15], lower = 0, higher = 1
    Then output = [0,0.6,1]
    
    Math is (data-min)/(max-min)
    
    Inputs:
    data: A 2D numpy array of shape (n,d)
    lower: Lower bound
    higher: Upper bound
    
    Outputs:
    dataP: The scaled version of the data, still a 2D numpy array of shape (n,d)
    
    WARNING: This is per column so do not use min/max of one column for the other
    """
    
    # YOUR CODE HERE
    # math is ((data-min)/(max-min))*(higher-lower) + lower
    minCols = data.min(axis=0)
    maxCols = data.max(axis=0)
    if not inPlace:
        data = data.copy()
    data = (data-minCols)/(maxCols-minCols)*(higher-lower)+lower
    
    return data

In [18]:
data = np.random.random((10,3))*100-50

print('Original Data')
print(data)
print(data.min(axis=0),data.max(axis=0))
print()

mnsData = minMaxScaling(data, 0, 1)
print('Scaled Data 0,1')
print(mnsData)
print(mnsData.min(axis=0),mnsData.max(axis=0))

Original Data
[[-15.73250839 -28.91043813 -23.72599306]
 [ 43.6698752   34.95559175  29.52457098]
 [-35.02262154   0.92580988  -1.22570393]
 [-21.7217304   31.06830112  45.23703686]
 [ 37.18133633  33.30556326 -48.2483868 ]
 [-22.04605677   8.38484942   9.97095271]
 [-17.10381454  45.98727975 -12.51094452]
 [-24.16846729  -5.13044081 -14.86310645]
 [ 33.21113373 -40.84292305  45.38070849]
 [ -5.92291273  22.95326278 -21.6353954 ]]
[-35.02262154 -40.84292305 -48.2483868 ] [43.6698752  45.98727975 45.38070849]

Scaled Data 0,1
[[0.24513281 0.13742321 0.26190997]
 [1.         0.87295103 0.83064946]
 [0.         0.48103922 0.50222298]
 [0.16902363 0.82818215 0.99846552]
 [0.91754565 0.8539481  0.        ]
 [0.16490219 0.56694296 0.6218082 ]
 [0.22770668 1.         0.38169163]
 [0.13793125 0.41129101 0.35656951]
 [0.86709354 0.         1.        ]
 [0.36979013 0.73472345 0.28423848]]
[0. 0. 0.] [1. 1. 1.]


In [19]:
print('Original Data')
print(data)
print(data.min(axis=0),data.max(axis=0))
print()

mnsData = minMaxScaling(data, -1, 1)
print('Scaled Data -1,1')
print(mnsData)
print(mnsData.min(axis=0),mnsData.max(axis=0))

Original Data
[[-15.73250839 -28.91043813 -23.72599306]
 [ 43.6698752   34.95559175  29.52457098]
 [-35.02262154   0.92580988  -1.22570393]
 [-21.7217304   31.06830112  45.23703686]
 [ 37.18133633  33.30556326 -48.2483868 ]
 [-22.04605677   8.38484942   9.97095271]
 [-17.10381454  45.98727975 -12.51094452]
 [-24.16846729  -5.13044081 -14.86310645]
 [ 33.21113373 -40.84292305  45.38070849]
 [ -5.92291273  22.95326278 -21.6353954 ]]
[-35.02262154 -40.84292305 -48.2483868 ] [43.6698752  45.98727975 45.38070849]

Scaled Data -1,1
[[-0.50973437 -0.72515359 -0.47618006]
 [ 1.          0.74590206  0.66129893]
 [-1.         -0.03792156  0.00444595]
 [-0.66195275  0.6563643   0.99693105]
 [ 0.83509129  0.70789619 -1.        ]
 [-0.67019563  0.13388593  0.24361641]
 [-0.54458664  1.         -0.23661673]
 [-0.7241375  -0.17741797 -0.28686099]
 [ 0.73418707 -1.          1.        ]
 [-0.26041973  0.46944689 -0.43152305]]
[-1. -1. -1.] [1. 1. 1.]


In [20]:
print('Original Data')
print(data)
print(data.min(axis=0),data.max(axis=0))
print()

mnsData = minMaxScaling(data, 3, 5)
print('Scaled Data 3,5')
print(mnsData)
print(mnsData.min(axis=0),mnsData.max(axis=0))

Original Data
[[-15.73250839 -28.91043813 -23.72599306]
 [ 43.6698752   34.95559175  29.52457098]
 [-35.02262154   0.92580988  -1.22570393]
 [-21.7217304   31.06830112  45.23703686]
 [ 37.18133633  33.30556326 -48.2483868 ]
 [-22.04605677   8.38484942   9.97095271]
 [-17.10381454  45.98727975 -12.51094452]
 [-24.16846729  -5.13044081 -14.86310645]
 [ 33.21113373 -40.84292305  45.38070849]
 [ -5.92291273  22.95326278 -21.6353954 ]]
[-35.02262154 -40.84292305 -48.2483868 ] [43.6698752  45.98727975 45.38070849]

Scaled Data 3,5
[[3.49026563 3.27484641 3.52381994]
 [5.         4.74590206 4.66129893]
 [3.         3.96207844 4.00444595]
 [3.33804725 4.6563643  4.99693105]
 [4.83509129 4.70789619 3.        ]
 [3.32980437 4.13388593 4.24361641]
 [3.45541336 5.         3.76338327]
 [3.2758625  3.82258203 3.71313901]
 [4.73418707 3.         5.        ]
 [3.73958027 4.46944689 3.56847695]]
[3. 3. 3.] [5. 5. 5.]


In [21]:
# Some ambiguity, I will accept both the entire median and just the column median
# Solution is column by column

# 15 points
def fillWithMedian(data, inPlace = False):
    """
    This function replaces the missing data (given as np.nan) with the median of the rest of the data
    A 1D example:
    Let input = [-10,np.nan,15,5]
    Then output = [-10,5,15,5]
        
    Inputs:
    data: A 2D numpy array of shape (n,d)
    lower: Lower bound
    higher: Upper bound
    
    Outputs:
    dataP: The filled version of the data, still a 2D numpy array of shape (n,d)
    
    WARNINGs: 
    - This is per column so do not mix the median of one column with the other
    - There maybe more than 1 nan in a column!
    
    """
    
    # YOUR CODE HERE
    if not inPlace:
        data = data.copy()
        
    n,d=data.shape
    for i in range(d):
        nanInds = np.isnan(data[:,i])
        regInds = np.logical_not(nanInds)
        data[nanInds,i] = np.median(data[regInds,i])
    
    return data
    

In [22]:
data = np.random.randint(0,10,(10,3)).astype('float64')

k = 5
while k > 0:
    k -= 1
    i = np.random.randint(0,10)
    j = np.random.randint(0,3)
    data[i,j] = np.nan

print('Original Data')
print(data)
print()

filledData = fillWithMedian(data)
print('NaNs filled Data')
print(filledData)

Original Data
[[ 4.  8.  9.]
 [ 6.  2.  5.]
 [ 2.  2.  5.]
 [ 5.  8.  1.]
 [ 2. nan  6.]
 [ 9.  0. nan]
 [ 3. nan nan]
 [ 2.  9.  4.]
 [ 5.  0.  4.]
 [ 3.  3. nan]]

NaNs filled Data
[[4.  8.  9. ]
 [6.  2.  5. ]
 [2.  2.  5. ]
 [5.  8.  1. ]
 [2.  2.5 6. ]
 [9.  0.  5. ]
 [3.  2.5 5. ]
 [2.  9.  4. ]
 [5.  0.  4. ]
 [3.  3.  5. ]]


In [23]:
# THIS QUESTION HAD ITS OWN ISSUES. I AM PROVIDING A SOLUTION FOR MY ORIGINAL INTENT 
# HOWEVER, WE WILL GRADE IT BASED ON HOW YOU MAY HAVE UNDERSTOOD IT
# We will let multiple intrepretations get full grades!
# 10 points
def randomSelection(data, ratio):
    """
    This function selects a random portion of the data. The amount of data to be randomly picked is determined by ratio.
    The random selection is done on the rows and not on the columns!
    
    A 1D example:
    Let input = [-10,15,5] and ratio = 1/3
    Then output = [5] (assuming 5 was randomly selected)
    
    A way to do random selection: Generate random row indices
    
    Inputs:
    data: A 2D numpy array of shape (n,d)
    ratio: Ratio of data to be selected randomly
    
    Outputs:
    dataS: The randomly selected data pf shape (n*ratio, d)
    
    Note: Round d*ratio to the nearest integer using round(n*ratio)
    """
    
    # YOUR CODE HERE
    n,d = data.shape
    k = round(n*ratio)
    inds = np.random.randint(0,n,k)
    print(k)
    return data[inds,:]

In [24]:
data = np.random.random((10,3))
dataRand = randomSelection(data,0.4)
print(data.shape)
print(dataRand.shape)
print()
print(data)
print()
print(dataRand)

4
(10, 3)
(4, 3)

[[0.7929979  0.84242827 0.88219308]
 [0.00506825 0.4367491  0.15616721]
 [0.99284149 0.91414038 0.1305127 ]
 [0.74167449 0.1611541  0.89365708]
 [0.41303669 0.03071142 0.87889626]
 [0.29340994 0.21365833 0.98904565]
 [0.60615315 0.62373057 0.30231672]
 [0.42890468 0.49821168 0.30299432]
 [0.42011129 0.94105531 0.66807365]
 [0.05035005 0.96189965 0.95397826]]

[[0.42890468 0.49821168 0.30299432]
 [0.41303669 0.03071142 0.87889626]
 [0.60615315 0.62373057 0.30231672]
 [0.05035005 0.96189965 0.95397826]]
