# Lab 1: Python Refresher

---
# Markdown Cell Review

HELLO **THIS IS BOLD TEXT**

# THIS IS A TITLE

## I'M A SUBTITLE

*Now in italics*

```
1 + 1
```

$
\sum_i^n x^i 
$

$$
\sum_i^n x^i 
$$

[Click here for a markdown cheatsheet](https://github.com/adam-p/markdown-here/wiki/Markdown-Here-Cheatsheet)

---
# If you have no coding experience, [click here for a tutorial on programming](https://www.kaggle.com/learn/intro-to-programming)

---
# [Click here to access the Python official tutorial](https://docs.python.org/3.12/tutorial/index.html)

---
# [Click here for a nice Python tutorial](https://www.kaggle.com/learn/python)

---
# Current Work Directory

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming 'seed' is predefined in the notebook. If not, define a seed for reproducibility
seed = 42  # Replace with the notebook's random seed

# Load the dataset
hockey_df = pd.read_csv('/mnt/data/hockey.csv')  # Adjust the path as needed

# Preparing the data
X = hockey_df.drop('icetime', axis=1)
y = hockey_df['icetime']

# Encoding categorical variables with OneHotEncoder, handling unknown categories
categorical_features = ['opposingTeam', 'home_or_away']
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_features)
    ], remainder='passthrough')

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

# Linear Regression model
model = LinearRegression()

# Creating a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Training the model
pipeline.fit(X_train, y_train)

# Predicting on training and test sets
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)

# RMSE for training and test sets
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Cross-validated RMSE over 5 folds
cv_rmse = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
cv_rmse = np.sqrt(-cv_rmse)
cv_mean_rmse = cv_rmse.mean()
cv_std_rmse = cv_rmse.std()

print(f"Training RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Cross-validated RMSE (mean): {cv_mean_rmse}")
print(f"Cross-validated RMSE (std): {cv_std_rmse}")

# Comment on the performance regarding overfitting/underfitting


FileNotFoundError: [Errno 2] No such file or directory: 'path_to_hockey.csv'

---
# Types

**Built-in Data Types by Category:**

Text Type:      **str**

Numeric Types:  **int**, **float**, **complex**

Sequence Types: **list**, **tuple**, **range**

Set Types:      **set**, **frozenset**

**List vs Tuple vs Set:**

<left><img src="./Builtin/Material_01.png" alt="drawing" width="400"><left>

("frozenset" is immutable version of "set")

Mapping Type:   **dict**

Boolean Type:   **bool**

Binary Types:   **bytes**, **bytearray**, **memoryview**

None Type:      **NoneType**

In [1]:
print('------Text------')

a = 'Hello'
print('\na is {}'.format(a))
print(type(a))

------Text------

a is Hello
<class 'str'>


In [4]:
print('------Numeric------')

b = '10'
print('\nb is {}'.format(b))
print(type(b))

c = '10.1'
print('\nc is {}'.format(c))
print(type(c))

d = 2j
print('\nd is {}'.format(d))
print(type(d))

------Numeric------

b is 10
<class 'str'>

c is 10.1
<class 'str'>

d is 2j
<class 'complex'>


In [6]:
print('------Sequence------')

e = ["apple", "banana", "cherry"]
print('\ne is {}'.format(e))
print(type(e))

f = ("apple", "banana", "cherry")
print('\nf is {}'.format(f))
print(type(f))

g = range(6)
print('\ng is {}'.format(g))
print(type(g))

------Sequence------

e is ['apple', 'banana', 'cherry']
<class 'list'>

f is ('apple', 'banana', 'cherry')
<class 'tuple'>

g is range(0, 6)
<class 'range'>


In [7]:
print('------Set------')

h = {"apple", "banana", "cherry"}
print('\nh is {}'.format(h))
print(type(h))

i = frozenset({"apple", "banana", "cherry"})
print('\ni is {}'.format(i))
print(type(i))

------Set------

h is {'apple', 'banana', 'cherry'}
<class 'set'>

i is frozenset({'apple', 'banana', 'cherry'})
<class 'frozenset'>


In [8]:
print('------Mapping------')

j = {"name" : "John", "age" : 36}
print('\nj is {}'.format(j))
print(type(j))

------Mapping------

j is {'name': 'John', 'age': 36}
<class 'dict'>


In [12]:
print('------Boolean------')

k = False
print('\nk is {}'.format(k))
print(type(k))

------Boolean------

k is False
<class 'bool'>


In [13]:
print('------Binary------')

l = b"Hello"
print('\nl is {}'.format(l))
print(type(l))

m = bytearray(5)
print('\nm is {}'.format(m))
print(type(m))

n = memoryview(bytes(5))
print('\nn is {}'.format(n))
print(type(n))

------Binary------

l is b'Hello'
<class 'bytes'>

m is bytearray(b'\x00\x00\x00\x00\x00')
<class 'bytearray'>

n is <memory at 0x107d4f400>
<class 'memoryview'>


In [11]:
print('------None------')

o = None
print('\no is {}'.format(o))
print(type(o))

------None------

o is None
<class 'NoneType'>


---
# Conditionals

In [14]:
x = 435.10
y = 435.4

if x < y:
    print('x < y: x is {} and y is {}'.format(x, y))

x < y: x is 435.1 and y is 435.4


In [18]:
a = True
b = False

if a and b:
    print('Expression is true')
else:
    print('Expression is false')

Expression is false


---
# For Loop

In [None]:
words = ['one', 'two', 'three', 'four', 'five']
for i in words:
    print(i)

print('\n')

#Index for each one (Make tuple)
for i in enumerate(words):
    print(i)

print('\n')

for i in range(len(words)):
    print(i)

---
# While Loop

In [35]:
# A recurrence relation: simple fibonacci series
a, b = 0, 1
while b < 1000:
    print(b, end = ' ', flush = True)
    a, b = b, a + b

1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987 

In [None]:
# Enter the correct password, or I won't let you proceed :-)

secret = 'Python'
password = ''

while password != secret:
    password = input("What's the password? ") 
else:
  print("Passeord is correct!")

---
# Function

In [None]:
def sum(a,b):
    c = a+b
    return print('Sum is {}'.format(c))

sum(1,2)

In [None]:
# Get Python version
import platform

def message():
    print('This is python version {}'.format(platform.python_version()))

def main():
    message()

if __name__ == '__main__': main()

---
# Function + Conditional + For

In [None]:
# Find out if a given number is prime

def isprime(n):
    if n <= 1:
        return False
    for x in range(2, n):
        if n % x == 0:
            return False
    else:
        return True

n = 5
if isprime(n):
    print(f'{n} is prime')
else:
    print(f'{n} not prime')

In [None]:
# One liner

hungry = True
x = 'Feed the python now!' if hungry else 'Do not feed the python.'
print(x)

---
# Class

In [None]:
# Let's use a Class to determine the characteristics of an anaconda (the animal - not the IDE)

class anaconda:
    def sounds(self):
        print('Hisses like a snake!')

    def moves(self):
        print('Slithers like a snake!')

In [None]:
# sound and move 

python = anaconda()
python.sounds()

print('-----')

python.moves()

In [None]:
# inheritance

def main():
    cobra = anaconda()
    cobra.sounds()
    cobra.moves()

if __name__ == '__main__': main()

In [None]:
class snake:
  def __init__(self, s, l):
    self.specie = s
    self.length = l

anaconda = snake("Python", 2)

print(anaconda.specie)
print('-----')
print(anaconda.length) 
print('-----')
print(type(anaconda)) 

---
# *args

In [None]:
# It is used to pass a variable number of arguments to a function.

def my_sum(my_integers):
    result = 0
    for x in my_integers:
        result += x
    return result

list_of_integers = [1, 2, 3]
print(my_sum(list_of_integers))

# This is where *args can be really useful, because it allows you to pass 
# a varying number of positional arguments:
def my_sum(*integers):
    result = 0
    for x in integers:
        result += x
    return result

print(my_sum(1, 2, 3))


In [None]:
# another example
def main():
    kitten('meow', 'grrr', 'purr') # kitten()

def kitten(*args):
    if len(args):
        for s in args:
            print(s)
    else: print('Meow.')

if __name__ == '__main__': main()

---
# **kwargs

In [None]:
# It works just like *args, but instead of accepting positional arguments it accepts keyword (or named) arguments

def main():
    kitten(Honey = 'meow', Daisy = 'grr', Ed = 'rawr')

def kitten(**kwargs):
    if len(kwargs):
        for k in kwargs:
            print('Kitten {} says {}.'.format(k, kwargs[k]))
    else: print('Meow.')

if __name__ == '__main__': main()

---
# Decorator

In [None]:
# A decorator is a function that takes another function and extends
# the behavior of the latter function without explicitly modifying it.

#### @decorator
#### def func():
####     ...
 
# is equivalent to:
 
#### def func():
####     …
#### func = decorator(func)

# A simple decorator:
# takes a function as argument and returns a new function that behaves like the
# original function, except it prints the function name and argument for every call
def trace(f):
    def g(*args):
        print(f.__name__, args)
        return f(*args)
    return g

@trace
def square(x):
    return x*x

@trace
def sum_of_squares(x, y):
    return square(x) + square(y)

square(3,)
# print('-----')
# sum_of_squares (3, 4)

In [None]:
import time
import builtins # sometimes need it sometimes dont depending on your platform

def elapsed_time(f):
    def wrapper():
        t1 = time.time()
        f()
        t2 = time.time()
        print(f'Elapsed time: {(t2 - t1) * 1000} ms')
    return wrapper

@elapsed_time
def big_sum():
    num_list = []
    for num in (range(0, 10000)):
        num_list.append(num)
    print(f'Big sum: {builtins.sum(num_list)}')

def main():
    big_sum()

if __name__ == '__main__': main()

---
# [NumPy](https://numpy.org/)

In [1]:
# conda install numpy
# or
# pip install numpy

# To access NumPy and its functions import it in your Python code like below.
# We shorten the imported name to "np" for better readability of code
import numpy as np 

a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])

print('\n-----1-----\n')
print(a)
print('\n-----2-----\n')
print(a[1])
print('\n-----3-----\n')
print(a[1][2])
print('\n-----4-----\n')


-----1-----

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]

-----2-----

[5 6 7 8]

-----3-----

7

-----4-----



In [4]:
# uninitialized empty array
b = np.empty([2, 2], dtype=int)
print(b)
print('\n-----5-----\n')

[[0 0]
 [0 0]]

-----5-----



In [6]:
# Initialized array
b = np.zeros([2, 2], dtype=int)
print(b)
print('\n-----6-----\n')
print(b.shape)
print('\n-----7-----\n')
print(type(b))

# print(a+b) # gives error

b = np.ones(a.shape)
print('\n-----8-----\n')
print(a)
print(b)
print(a+b)

[[0 0]
 [0 0]]

-----6-----

(2, 2)

-----7-----

<class 'numpy.ndarray'>

-----8-----

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
[[ 2.  3.  4.  5.]
 [ 6.  7.  8.  9.]
 [10. 11. 12. 13.]]


[Click here for a numpy cheat sheet](./Builtin/Material_02.pdf)

---
# [pandas](https://pandas.pydata.org/)

In [8]:
import pandas as pd

print('-----1-----\n')
print(pd.__version__)

df = pd.DataFrame({'num_legs': [2, 4, 2], 'num_wings': [2, 0, 0]}, index=['falcon', 'dog', 'human'])
print(df)

-----1-----

1.5.3
        num_legs  num_wings
falcon         2          2
dog            4          0
human          2          0


In [9]:
print('-----2-----\n')
print(df.info())

-----2-----

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, falcon to human
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   num_legs   3 non-null      int64
 1   num_wings  3 non-null      int64
dtypes: int64(2)
memory usage: 72.0+ bytes
None


In [10]:
print('-----3-----\n')
print(df.head(n=10))

-----3-----

        num_legs  num_wings
falcon         2          2
dog            4          0
human          2          0


In [11]:
print('-----4-----\n')
print(df.tail(n=1))

-----4-----

       num_legs  num_wings
human         2          0


In [12]:
print('-----5-----\n')
print(df['num_legs']+df['num_wings'])

-----5-----

falcon    4
dog       4
human     2
dtype: int64


In [13]:
print('-----6-----\n')
print(df.dtypes)

-----6-----

num_legs     int64
num_wings    int64
dtype: object


In [None]:
print('\n-----7-----\n')
data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
df2 = pd.DataFrame(data, columns=['c', 'a'])
df2.head()

# can use "astype" to change type
# df = df.astype({'c': 'int64', 'a': 'int64'})

[Click here for a pandas cheat sheet](./Builtin/Material_03.pdf)

[Click here for a nice pandas tutorial for data manipulation skills](https://www.kaggle.com/learn/pandas)

---
# In-class exercise: Code a soccer lineups generator for the coach

Imagine you have been given a list of 22 soccer players along with their performance scores (PS) measured out of 11:

|Player Name | PS|
|---|---|
|James|1|
|Robert|2|
|John|3|
|Michael|4|
|David|5|
|William|6|
|Richard|7|
|Joseph|8|
|Thomas|9|
|Charles|10|
|Christopher|11|
|Mary|1|
|Patricia|2|
|Jennifer|3|
|Linda|4|
|Elizabeth|5|
|Barbara|6|
|Susan|7|
|Jessica|8|
|Sarah|9|
|Karen|10|
|Lisa|11|

The coach wants you to develop a random roster generator that operates in a fair way. Meaning, upon every execution of your code, it must output two random lineups (each composed of 11 players) such that __no two players with the same performance scores fall in the same team__. For example, one output can be:

|Team 1 | PS |Team 2 | PS |
|-------|------|-------|------|
|James      |1 |Mary       |1 |
|Robert     |2 |Patricia   |2 |
|John       |3 |Jennifer   |3 |
|Michael    |4 |Linda      |4 |
|David      |5 |Elizabeth  |5 |
|William    |6 |Barbara    |6 |
|Richard    |7 |Susan      |7 |
|Joseph     |8 |Jessica    |8 |
|Thomas     |9 |Sarah      |9 |
|Charles    |10|Karen      |10|
|Christopher|11|Lisa       |11|

Or, another output can be:

|Team 1 | PS |Team 2 | PS |
|-------|------|-------|------|
|James      |1 |Mary       |1 |
|Patricia   |2 |Robert     |2 |
|John       |3 |Jennifer   |3 |
|Michael    |4 |Linda      |4 |
|David      |5 |Elizabeth  |5 |
|William    |6 |Barbara    |6 |
|Richard    |7 |Susan      |7 |
|Jessica    |8 |Joseph     |8 |
|Thomas     |9 |Sarah      |9 |
|Charles    |10|Karen      |10|
|Christopher|11|Lisa       |11|

Write three versions of your code each in a separate cell:
1. Version 1 - Use sequence type __*list*__ for both storing original data and outputing lineups.
2. Version 2 - Use sequence type __*tuple*__ for both storing original data and outputing lineups.
3. Version 3 - Use __*pandas dataframe*__ for both storing original data and outputing lineups.
   
Caveats:
1. Outputted lineups must show not only players names but also their corresponding Performance Scores.
2. Your code is supposed to be a __random__ lineup generator meaning the suggested lineups of every execution of your code must be different from previous executions.
3. Random does not mean unique, this is not a problem with infinite number of solutions.

---
## Version 1 - Using sequence type list

In [None]:
import random
# pip install tabulate
from tabulate import tabulate

original_data = [['James',1],['Robert',2],['John',3],['Michael',4],['David',5],['William',6],['Richard',7],
          ['Joseph',8], ['Thomas',9],['Charles',10],['Christopher',11],['Mary',1],['Patricia',2],['Jennifer',3],['Linda',4],
          ['Elizabeth',5],['Barbara',6],['Susan',7],['Jessica',8],['Sarah',9],['Karen',10],['Lisa',11]]

print ('The type of original data is', type(original_data) )

ppl = len(original_data)
team_1 = []
team_2 = []

for i in random.sample(range(ppl),ppl):
  if original_data[i][1] in [x[1] for x in team_1]:
    team_2.append(original_data[i])
  else:
    team_1.append(original_data[i])

output = [team_1.sort (key = lambda x:x[1]),team_2.sort (key = lambda x:x[1])]
print('The type of lineups is ',type(output))
print(tabulate({'Team 1': team_1,'Team 2': team_2}, headers="keys"))

---
## Version 2 - Using sequence type tuple

In [None]:
import random
from tabulate import tabulate

original_data = (('James',1),('Robert',2),('John',3),('Michael',4),('David',5),('William',6),('Richard',7),
          ('Joseph',8), ('Thomas',9),('Charles',10),('Christopher',11),('Mary',1),('Patricia',2),('Jennifer',3),('Linda',4),
          ('Elizabeth',5),('Barbara',6),('Susan',7),('Jessica',8),('Sarah',9),('Karen',10),('Lisa',11))

print ('The type of original data is', type(original_data))

#data = sorted(original_data, key = lambda x :x[1])
ppl = len(original_data)
team_1 = ()
team_2 = ()

for i in random.sample(range(ppl),ppl):
  if original_data[i][1] in [x[1] for x in team_1]:
    team_2 = (original_data[i],) + team_2
  else:
    team_1 = (original_data[i],) + team_1

team_1, team_2 = (sorted(team_1, key = lambda x :x[1]), sorted(team_2, key = lambda x :x[1]))
output = team_1, team_2
print('The type of lineups is ',type(output))
print(tabulate({'Team 1': team_1, 'Team 2': team_2}, headers="keys"))

---
## Version 3 - Using pandas dataframe

In [None]:
import pandas as pd

original_data = pd.DataFrame([['James',1],['Robert',2],['John',3],['Michael',4],['David',5],['William',6],['Richard',7],
          ['Joseph',8], ['Thomas',9],['Charles',10],['Christopher',11],['Mary',1],['Patricia',2],['Jennifer',3],['Linda',4],
          ['Elizabeth',5],['Barbara',6],['Susan',7],['Jessica',8],['Sarah',9],['Karen',10],['Lisa',11]], columns= ['Player','Performance'])

print ('The type of original data is', type(original_data))

team_1 = original_data.groupby('Performance').sample(n=1).sort_values(by=['Performance']).reset_index(drop=True)
team_2 = original_data[~original_data['Player'].isin(team_1['Player'])].sort_values(by=['Performance']).reset_index(drop=True)

output = pd.concat([team_1, team_2.reindex(team_1.index)], axis=1)
print('The type of lineups is ',type(output))

print (output)

---