- **linear regression** refers to plotting / predicting with a straight regression line
- **m,b = np.polyfit(x,y,1)** is used to get slope (m) and y-intercept (b)
- m and b are then used to calculate y in terms of x as **`m*x + b`**
- **polynomial regression** refers to plotting / predicting with a curved regression line

In [1]:
# import libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from IPython.display import Image
from scipy import stats
import pprint as pp
import sys # system

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
base_path = "/content/drive/MyDrive/____Intro-Python-Machine-Learning-Dec-2025/"

In [4]:
# append base path to system so that we can load our own modules as .py files
# the scrabble_dictionary.py contains ONE thing: a list of words called scrabbleDictionary
sys.path.append(base_path)

In [5]:
from scrabble_dictionary import scrabbleDictionary

In [6]:
# did it work? print the length and type of scrabbleDictionary
print(len(scrabbleDictionary), type(scrabbleDictionary))
# print every 20,000th word from beginning to end of the ~172K-word list
# (a step of 20000 over 172,820 words yields 9 sample words)
pp.pprint(scrabbleDictionary[::20000])

172820 <class 'list'>
['aa',
 'cafetorium',
 'deterrence',
 'gadfly',
 'juked',
 'nonsensicalness',
 'pseudepigraphies',
 'skycaps',
 'typifier']


In [7]:
# a regression line is the line that minimizes the sum of the squared
# distances from a set of dots on a scatterplot to the line

# let's just cook up some of our own x, y data
# challenge: make a list, x, of consecutive ints from 1-20
x = list(range(1,21))
print('x:',x) # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ... 18, 19, 20]
y = [12,14,17,13,20,23,15,28,26,29,37,32,36,38,50,33,40,46,50,53]
print('y:',y)
# make a time series line chart (jaggy line moving up and to the right)
# plt.plot(y)
# make a scatterplot with x, y data, and make the dots hollow


# equation of a line (slope-intercept form): y = mx + b
# x, y = the (x,y) point on the line
# m = slope of the line
# b = y-intercept (where the line crosses the y-axis)
# obtain m and b (we only have x, y so far)
# np.polyfit(x,y,1) returns m, b

# TODO: the two prints below only output the labels -- m and b still have to be
# computed (see the np.polyfit note above) and passed to print
print('m:')
print('b:')
# the 1 passed to np.polyfit is the degree of the polynomial to fit
# (degree 1 = a straight line), not a "number of slopes"
# now that we have x, y, m and b we can plot the regression line
# y = mx + b

# "Regression Line (Least Squared Distance)"
# "Car Age vs. Speed at EZ-Pass Toll"
# 'Car Age in Years'
# 'Car Speed in MPH'
# # we need m for the slope of the reg line, b is y-intercept
# # and we need b to know where the reg line crosses the y-axis


x: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
y: [12, 14, 17, 13, 20, 23, 15, 28, 26, 29, 37, 32, 36, 38, 50, 33, 40, 46, 50, 53]
m:
b:


**map(function, list)** as alternative to looping a list

In [8]:
# Scrabble word score checker challenge:
# letter -> tile value lookup, built by pairing each character with its points.
# Characters run A-Z in alphabetical order, followed by space and hyphen,
# which score 0 so that multi-word / hyphenated entries can still be looked up.
scrabble_values = dict(zip(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ -",
    [1, 3, 3, 2, 1, 4, 2, 4, 1, 8, 5, 1, 3,      # A .. M
     1, 1, 3, 10, 1, 1, 1, 1, 4, 4, 8, 4, 10,    # N .. Z
     0, 0],                                      # space, hyphen
))

# print a few letter values:
# pp.pprint(scrabble_values)
print(scrabble_values["Z"])
print(scrabble_values["X"])

10
8


In [9]:
# calculate scrabble score from user input word:
# assume valid input (no gibberish, digits or special chars)
# word = input("Enter word, get Scrabble Score:")
# # initialize score
# score = 0
# # loop the word:
# d = {"word":word} # new dict, seeded with the word, to hold the result
# for char in word:
#   score += scrabble_values[char.upper()]

# d["scrabble_score"] = score # add score as dict prop
# # add an "avg_letter_value" key, the value of which is:
# d["avg_letter_value"] = round(score / len(word),3)

# print(f"scrabble dict:\n", d)

In [10]:
# get Scrabble scores of a whole list of words:
fruits = ['apple', 'apricot', 'banana', 'blueberry', 'grape', 'grapefruit', 'kiwi', 'lemon', 'lime', 'mango',
          'orange', 'papaya', 'peach', 'pear', 'pineapple', 'plum', 'raspberry', 'strawberry', 'tangerine']

In [11]:
# three more word lists to score, in addition to fruits
words_list_1 = ['car', 'cat', 'dog', 'truck', 'zebra', 'house']
# NOTE(review): 'quarz' looks like a typo for 'quartz' -- as spelled, the Scrabble
# dictionary check further down rejects it, so it is reported as a fake word; confirm intent
words_list_2 = ['iguana', 'igloo', 'jaguar', 'juniper', 'quail', 'quarz']
words_list_3 = ['xylophone', 'zither', 'mongoose', 'gazelle', 'puma', 'topaz']

In [12]:
# how to combine the four lists into ONE: extend() doesn't work in one line:
# words_list_1.extend([fruits, words_list_2,words_list_3])
# NO! Results in nested list

In [13]:
# bundle the lists in one set of square brackets
# put * in front of each name to unpack it
# set equal to new list
words_list = [*fruits, *words_list_1, *words_list_2, *words_list_3]

In [14]:
pp.pprint(words_list)

['apple',
 'apricot',
 'banana',
 'blueberry',
 'grape',
 'grapefruit',
 'kiwi',
 'lemon',
 'lime',
 'mango',
 'orange',
 'papaya',
 'peach',
 'pear',
 'pineapple',
 'plum',
 'raspberry',
 'strawberry',
 'tangerine',
 'car',
 'cat',
 'dog',
 'truck',
 'zebra',
 'house',
 'iguana',
 'igloo',
 'jaguar',
 'juniper',
 'quail',
 'quarz',
 'xylophone',
 'zither',
 'mongoose',
 'gazelle',
 'puma',
 'topaz']


In [15]:
# BONUS: Check each word against the Scrabble dictionary of ~172K words
# add a few fake words:
# words_list is mutated in place, so only add the fakes that are not already
# there -- otherwise re-running this cell would append them a second time
fake_words_to_add = ['aafad','ffgad', 'jwtreagag', 'nasorqew', 'weqguidgh']
words_list.extend(wd for wd in fake_words_to_add if wd not in words_list)
words_list.sort() # alphabetize the words so that fake words get mixed in

In [16]:
pp.pprint(words_list)

['aafad',
 'apple',
 'apricot',
 'banana',
 'blueberry',
 'car',
 'cat',
 'dog',
 'ffgad',
 'gazelle',
 'grape',
 'grapefruit',
 'house',
 'igloo',
 'iguana',
 'jaguar',
 'juniper',
 'jwtreagag',
 'kiwi',
 'lemon',
 'lime',
 'mango',
 'mongoose',
 'nasorqew',
 'orange',
 'papaya',
 'peach',
 'pear',
 'pineapple',
 'plum',
 'puma',
 'quail',
 'quarz',
 'raspberry',
 'strawberry',
 'tangerine',
 'topaz',
 'truck',
 'weqguidgh',
 'xylophone',
 'zebra',
 'zither']


In [44]:
# calculate scrabble scores for list of words:
# verify word against scrabble dict of ~172K words (172,820 entries)
list_of_dicts = [] # a list to hold the dict results
fake_words_list = []

In [45]:
# reset the result lists here so that re-running this cell starts clean
# instead of appending duplicates to lists left over from an earlier run
list_of_dicts = [] # a list to hold the dict results
fake_words_list = [] # words that are not in the Scrabble dictionary

# build a set of the dictionary words once: `in` on a set is a hash lookup,
# whereas `in` on the 172,820-item list rescans the list for every word
scrabble_words_set = set(scrabbleDictionary)

# loop the list of words:
for wd in words_list:
  if wd in scrabble_words_set:
    score = 0 # initialize score
    vowel_count = 0
    wd_dict = {"word":wd} # new dict, seeded with the word, to hold result
    # loop the individual word, letter by letter:
    for letter in wd:
      # look up the letter's value in scrabble_values (keys are uppercase)
      # and increment score
      score += scrabble_values[letter.upper()]
      if letter.lower() in 'aeiou':
        vowel_count += 1
    # after the inner word loop is done, but still in outer list loop:
    wd_dict["scrabble_score"] = score # make dict property for word score
    # add an "avg_letter_value" key: the score divided by the number of letters
    wd_dict["avg_letter_value"] = round(score/len(wd),3)
    wd_dict["vowel_count"] = vowel_count
    # share of the word's letters that are vowels
    wd_dict["vowel_freq"] = round(vowel_count/len(wd),3)
    # add the completed dictionary to the list of dictionaries:
    list_of_dicts.append(wd_dict)
  else:
    # not a valid Scrabble word: set it aside instead of scoring it
    fake_words_list.append(wd)

# print the final result: the rejected words, then a list of dictionaries of 5 properties each:
print("Fake words (not in Scrabble Dictionary):\n", fake_words_list)
print()
pp.pprint(list_of_dicts)

Fake words (not in Scrabble Dictionary):
 ['aafad', 'ffgad', 'jwtreagag', 'nasorqew', 'quarz', 'weqguidgh']

[{'avg_letter_value': 1.8,
  'scrabble_score': 9,
  'vowel_count': 2,
  'vowel_freq': 0.4,
  'word': 'apple'},
 {'avg_letter_value': 1.571,
  'scrabble_score': 11,
  'vowel_count': 3,
  'vowel_freq': 0.429,
  'word': 'apricot'},
 {'avg_letter_value': 1.333,
  'scrabble_score': 8,
  'vowel_count': 3,
  'vowel_freq': 0.5,
  'word': 'banana'},
 {'avg_letter_value': 1.778,
  'scrabble_score': 16,
  'vowel_count': 3,
  'vowel_freq': 0.333,
  'word': 'blueberry'},
 {'avg_letter_value': 1.667,
  'scrabble_score': 5,
  'vowel_count': 1,
  'vowel_freq': 0.333,
  'word': 'car'},
 {'avg_letter_value': 1.667,
  'scrabble_score': 5,
  'vowel_count': 1,
  'vowel_freq': 0.333,
  'word': 'cat'},
 {'avg_letter_value': 1.667,
  'scrabble_score': 5,
  'vowel_count': 1,
  'vowel_freq': 0.333,
  'word': 'dog'},
 {'avg_letter_value': 2.429,
  'scrabble_score': 17,
  'vowel_count': 3,
  'vowel_freq': 

In [22]:
# make a df from the scrabble list of dicts:
scrabble_df = pd.DataFrame(list_of_dicts)

In [24]:
# NOTE(review): the saved output below shows (36, 3), but the scoring loop now stores
# 5 keys per word (word, scrabble_score, avg_letter_value, vowel_count, vowel_freq),
# so a fresh run should presumably report 5 columns -- re-run and confirm
print(scrabble_df.shape) # (rows,cols)
scrabble_df.head()

(36, 3)


Unnamed: 0,word,scrabble_score,avg_letter_value
0,apple,9,1.8
1,apricot,11,1.571
2,banana,8,1.333
3,blueberry,16,1.778
4,car,5,1.667


In [25]:
# TWO important things you MUST be able to do w DATA:
# sort and filter
# first sort by score, high to low:
by_scores_df = scrabble_df.sort_values(by="scrabble_score", ascending=False)

In [None]:
# show the top 10 -- no new df for this, just use .loc or .iloc
# by_scores_df[:10] # plain row slicing also works
by_scores_df.iloc[:10,:]

In [36]:
# challenge: sort by avg letter value
# then show the top
by_avg_df = scrabble_df.sort_values(by="avg_letter_value", ascending=False)

In [38]:
# show only the top 10 rows (highest avg letter value first) rather than
# dumping the whole DataFrame into the notebook output
by_avg_df.head(10)

Unnamed: 0,word,scrabble_score,avg_letter_value
34,zebra,16,3.2
31,topaz,16,3.2
35,zither,18,3.0
27,quail,14,2.8
15,kiwi,11,2.75
33,xylophone,24,2.667
7,gazelle,17,2.429
22,peach,12,2.4
13,jaguar,14,2.333
14,juniper,16,2.286


In [35]:
# test the function on a few words:

print('apple_score:')
print('bunny_score:')

apple_score:
bunny_score:


In [None]:
# loop the fruits list again BUT this time, do not calc the score directly inside loop,
# rather call the function and let it do it


# pp.pprint(words_scrabble_scores_dict)

**new_list = list(map(function, list))** runs function on each item in list
- each individual list item is passed to function as its argument
- map returns a new list of the return values of the function

In [None]:
# define a function that takes in an int argument; if the int is even, return the square, else return the cube


In [None]:
# make a list of consec ints from 1-20:
print()

In [None]:
# loop the nums_list, and call the func w each iteration, passing the current num to the func as its arg; save the result to a new list

print()

In [None]:
# map version of the above: no loop

print() # <map object at 0x7f836a1e69b0>


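- **lambda** is an anonymous function
- it can be used inline as a function argument where a function arg is expected
- **`lambda x : x ** 2`** returns the square of x
- **`lambda input : output`** is the general form: what comes before the colon is the input, what comes after is the value returned
- **`lambda argument : return_value`** is the same pattern, named by role: the argument goes in, the return value comes out
- **`variable = lambda x : x ** 2`** returns a function which you can store in a variable and call later, e.g. `variable(3)` gives 9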

In [None]:
# map version of the above: no loop, no external function call--use lambda instead; also generate the list to iterate right there inside map:
# map(lambda, list)

print() # <map object at 0x7f836a1e69b0>

In [None]:
# scrabble score calculator as external function vs map(lambda,list)
pets = ['bunny','cat','dog','ferret','gerbil']

In [None]:
# calc pets scrabble scores w map(func,list)
print()

#### **map(lambda, list) for calculating scrabble scores for list of words**
- **for ch in w** → loop through each letter in the word

- **ch.upper()** → make it uppercase

- **scrabble_values[ch.upper()]** → get that letter’s score

- **sum(...)** → add all scores for that word

- **map(..., pets)** → do it for every word in the list

- **list(...)** → turn the map result into a list

In [None]:
fruits = ['apple', 'apricot', 'banana', 'blueberry', 'grape', 'grapefruit', 'kiwi', 'lemon', 'lime', 'mango',
          'orange', 'papaya', 'peach', 'pear', 'pineapple', 'plum', 'raspberry', 'strawberry', 'tangerine']

# another map() example before moving on:
# challenge: using the fruits list, make treats according to these rules:
#            - if the fruit ends with a vowel, make a roll-up
#            - elif the fruit is a berry make "Boo-" brand cereal ("Boo-Strawberry") capitalize the fruit
#            - else make a popsicle
#            don't just print the treats -- store them in a new list called treats

**predicting speeds of cars through a toll based on hour of the day**
- knowing the hour, can you predict the speed, based on known data

In [None]:
# polynomial regression: fit a curve to hour-vs-speed data, then use the fitted
# curve to predict the speed at hours that are missing from the data
# x = hour of the day the car passes through an EZ-Pass
# (hours 0, 4, 11, 17, 20 and 23 are absent -- those are the ones to predict)
x = [1,2,3,5,6,7,8,9,10,12,13,14,15,16,18,19,21,22]
# y = EZ-Pass toll booth speed
y = [100, 90, 80, 60, 60, 55, 60, 65, 70, 70, 75, 76, 78, 79, 90, 99, 99, 100]
# scatter plot the x,y dots:
plt.scatter(x,y,s=10)
plt.ylim(40,130)
plt.title('Avg Speed of Cars through EZ-Pass by Hour')
plt.xlabel('Hour (0-23)')
plt.ylabel('Speed (KMPH)')


# np.polyfit(x,y,deg) fits a polynomial of degree deg (3 here) and returns its coefficients
# np.poly1d wraps those coefficients in a callable polynomial:
# predict_polynom(hour) evaluates the fitted curve at that hour
predict_polynom = np.poly1d(np.polyfit(x,y,3))

# np.linspace(start, stop, num): num evenly spaced values from start to stop
# here: 100 x-values from hour 0 to hour 23, dense enough to draw a smooth curve
curve = np.linspace(0,23,100)

# plot the curvy best fit line by evaluating predict_polynom at every point of curve
plt.plot(curve, predict_polynom(curve), color='coral')

# predict speeds by passing hrs to predict_polynom(); round to a whole number
_0_hr_speed = round(predict_polynom(0))
print("hour 0 (midnight to 12:59am) predicted speed in kmph:", _0_hr_speed)

# plot the 0 hr prediction as a triangle marker at (0, _0_hr_speed)
# (an earlier note gave this point as (0,114) -- confirm against the printed value)
# plt.scatter(0,hr0_speed,marker="^",color="#486",s=100)
plt.scatter(0,_0_hr_speed,marker='^',color='forestgreen',s=50)
# label the new dot with its numeric value, nudged right and down from the marker
plt.text(0+0.5,_0_hr_speed-1.5, str(_0_hr_speed), c='forestgreen')

# same steps for hour 4: predict, print, mark and label
hr4_speed = round(predict_polynom(4))
print("hour 4 (4:00-4:59AM) predicted speed in kmph:", hr4_speed)

plt.scatter(4,hr4_speed,marker="^",color="forestgreen",s=50)
# label the new dot with its numeric value, centered above the marker
plt.text(4,hr4_speed+3,str(hr4_speed),ha='center',color="forestgreen")

# print("hour 11 (11:00-11:59AM) predicted speed in kmph:",

# print("hour 17 (5:00-5:59PM) predicted speed in kmph:",

# print("hour 20 (8:00-8:59PM) predicted speed in kmph:",

# print("hour 23 (11:00-11:59PM) predicted speed in kmph:",

# above predictions on a loop:
# hrs_to_predict = [0,4,11,17,20,23]

# NOTE(review): x_list repeats the values of x and is not used in this cell --
# presumably left here for the loop exercise above; confirm before removing
x_list = [1,2,3,5,6,7,8,9,10,12,13,14,15,16,18,19,21,22]

plt.show()

In [None]:
# d.) BONUS: map() takes a function as its argument -- in the predict_speed example, that argument was a call
#.    to a named, external function BUT you can pass map an anonymous lambda function -- no outside function needed
# tre
# pp