## **Essential Python for Data Analysts**

- Review `Class`
- try except
- csv file, context manager
- pip install packages
- pandas & numpy
- web scraping `gazpacho`
- API `requests`
- Basic ML `sklearn`

In [None]:
# OOP object 
class Person:
    """A person with a name and an age."""

    def __init__(self, name, age):
        """Store the given name and age on the instance."""
        self.name = name
        self.age = age

    def say_hi(self):
        """Print a generic greeting."""
        print("hello!")

    def __str__(self):
        """Return a one-sentence introduction that includes the name."""
        # An f-string formats any name value, whereas `+` concatenation
        # raises TypeError when the name is not a string.
        return f"I am a person. My name is {self.name}."

In [None]:
john = Person("John", 25)

print(john)

I am a person. My name is John.


In [None]:
# OOP inheritance
# Employee <- Person

class Employee(Person):
    """A Person extended with a job position and a company."""

    def __init__(self, name, age, position, company):
        # Person.__init__ stores name and age; only the job fields are set here.
        super().__init__(name, age)
        self.position, self.company = position, company

    def say_hi(self):
        """Print a greeting that includes the name (overrides Person.say_hi)."""
        greeting = f"Hello! I'm {self.name}."
        print(greeting)

    def killing(self):
        """Print "Yeah!"."""
        print("Yeah!")

In [None]:
john_wick = Employee("John Wick", 35, "Assasin", "Twitter")

In [None]:
print(john_wick.name)
print(john_wick.age)
print(john_wick.position)
print(john_wick.company)

John Wick
35
Assasin
Twitter


In [None]:
type(john_wick)

__main__.Employee

In [None]:
john_wick.say_hi()

Hello! I'm John Wick.


In [None]:
john_wick.killing() # calling a method (a function defined on the class)

Yeah!


In [None]:
john_wick.company # reading an instance attribute (set in __init__)

'Twitter'

## Run Command Line in Notebook

In [None]:
!pwd

/content


In [None]:
!ls

sample_data


In [None]:
!mkdir data

In [None]:
!ls data

 chinook.db   food.txt	 hotel.csv  'Sample SuperStore.csv'


In [None]:
!cat data/food.txt

hamburger
hotdog
pizza
salad
french fried

In [None]:
!pip install gazpacho

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gazpacho
  Downloading gazpacho-1.1.tar.gz (7.9 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: gazpacho
  Building wheel for gazpacho (pyproject.toml) ... [?25l[?25hdone
  Created wheel for gazpacho: filename=gazpacho-1.1-py3-none-any.whl size=7482 sha256=b674f57b7d94616c051ff5755a22d031f2b3cbd809bc119a9851ae7c029a0590
  Stored in directory: /root/.cache/pip/wheels/ec/45/e0/490eb5e25601b4f9425fcde4a0034601c492a29e82268be4d3
Successfully built gazpacho
Installing collected packages: gazpacho
Successfully installed gazpacho-1.1


In [None]:
!pip list | grep "^gaz"

gazpacho                      1.1


In [None]:
# import package/ library/ module
import csv

In [None]:
!ls data

 chinook.db   food.txt	 hotel.csv  'Sample SuperStore.csv'


In [None]:
# Manual open/close: without a context manager the file has to be closed
# explicitly, so close() sits in `finally` and runs even if reading raises.
# newline="" is the mode the csv module documents for files it reads or writes.
file = open("data/hotel.csv", newline="")

data = []

try:
    content = csv.reader(file)
    for row in content:
        data.append(row)
finally:
    file.close()

print(data)

[['id', 'hotel', 'location', 'pricepernight'], ['1', 'Ideo', 'Bangkok', '20'], ['2', 'Accor', 'London', '25'], ['3', 'Premium', 'Seoul', '32'], ['4', 'Super Star', 'Dubai', '50'], ['5', 'Planet', 'Tokyo', '22'], ['6', 'Joby', 'Bangkok', '30'], ['7', 'JW Marriot', 'Bangkok', '35'], ['8', 'Kenshin', 'Tokyo', '20'], ['9', 'Eloquent', 'London', '42'], ['10', 'Big Ben', 'London', '45']]


In [None]:
# context manager (`with`): read/write a file and have it closed automatically

In [None]:
data = []

# `with` closes the file when the block ends, even if reading raises.
# newline="" is the mode the csv module documents for files it reads or writes.
with open("data/hotel.csv", "r", newline="") as file:
    content = csv.reader(file)
    data.extend(content)  # each item is one row, as a list of strings

print(data)

[['id', 'hotel', 'location', 'pricepernight'], ['1', 'Ideo', 'Bangkok', '20'], ['2', 'Accor', 'London', '25'], ['3', 'Premium', 'Seoul', '32'], ['4', 'Super Star', 'Dubai', '50'], ['5', 'Planet', 'Tokyo', '22'], ['6', 'Joby', 'Bangkok', '30'], ['7', 'JW Marriot', 'Bangkok', '35'], ['8', 'Kenshin', 'Tokyo', '20'], ['9', 'Eloquent', 'London', '42'], ['10', 'Big Ben', 'London', '45']]


In [None]:
# write a CSV file: data/output.csv

header = ['id', 'name', 'city']
body_data = [
    [1, 'John', 'London'],
    [2, 'Mary', 'Reading'],
    [3, 'Anna', 'Belgium']
]

# newline="" stops the writer's own "\r\n" line endings from being translated
# a second time, which would leave a blank line after every row on Windows.
with open("data/output.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(header)       # one header row
    writer.writerows(body_data)   # then every data row

In [None]:
!ls data

 chinook.db   food.txt	 hotel.csv   output.csv  'Sample SuperStore.csv'


In [None]:
!cat data/output.csv

id,name,city
1,John,London
2,Mary,Reading
3,Anna,Belgium


In [None]:
# How to handle errors

In [None]:
# try - except block

In [None]:
# try / except / finally: the `except` matching the raised error runs,
# and `finally` runs whether or not an error occurred.
try:
    quotient = 1/0  # raises ZeroDivisionError, so the rest of `try` is skipped
    print(quotient)
    print("Success!")
except NameError:
    print("variable not found!")
except ZeroDivisionError:
    print("this code is error!")
finally:
    print("Done...")

this code is error!
Done...


In [None]:
# "HOTEL.csv" does not match the lower-case "hotel.csv" listed by `!ls data`,
# so on a case-sensitive filesystem open() raises FileNotFoundError here.
try:
    with open("data/HOTEL.csv", "r", newline="") as file:
        # Named `reader` so the notebook-level `data` list is not rebound
        # to a csv reader object.
        reader = csv.reader(file)
        for row in reader:
            print(row)
    print("Load data successfully!")
except FileNotFoundError:
    print("File not found. Please check file name again.")

File not found. Please check file name again.


## CSVMonster Class

In [None]:
class CSVMonster:
    """Load a CSV file into memory and print simple summaries of it.

    The first row is treated as the header. The city filter and the price
    average use fixed column positions: index 2 for the city and index 3 for
    the price.
    """

    def __init__(self, filename):
        """Read every row of `filename` into `self.data`.

        A missing file is reported with a printed message rather than an
        exception; `self.data` is then left as an empty list.
        """
        self.data = []  # list of rows; each row is a list of strings
        try:
            # newline="" is the mode the csv module documents for its files.
            with open(filename, "r", newline="") as file:
                self.data.extend(csv.reader(file))
            print("Load data successfully.")
        except FileNotFoundError:
            print("Error. Please check file name again.")

    def info(self):
        """Print the number of columns and the number of rows after the header."""
        if not self.data:
            # Nothing was loaded (missing or empty file): there is no header
            # row to measure, so report that instead of raising IndexError.
            print("No data loaded.")
            return
        columns = len(self.data[0])
        rows = len(self.data) - 1  # the header row is not counted
        print(f"Columns: {columns}\nRows: {rows}")

    def filter_city(self, city):
        """Print the header, then every row whose column at index 2 equals `city`."""
        if not self.data:
            print("No data loaded.")
            return
        header, *rows = self.data

        print(header)
        for row in rows:
            # Rows too short to have a city column (e.g. blank lines, which
            # csv.reader yields as []) are skipped instead of raising.
            if len(row) > 2 and row[2] == city:
                print(row)

    def calculate_avg_price(self):
        """Print the mean of the column at index 3, parsed as integers."""
        # [1:] skips the header row; rows too short to have a price are skipped.
        prices = [int(row[3]) for row in self.data[1:] if len(row) > 3]
        if not prices:
            # Avoid dividing by zero when there is nothing to average.
            print("No price rows to average.")
            return

        avg_price = sum(prices) / len(prices)
        print(f"Avg. Price: {avg_price}")


In [None]:
csv_file = CSVMonster("data/hotel.csv")

Load data successfully.


In [None]:
csv_file.info()

Columns: 4
Rows: 10


In [None]:
csv_file.filter_city("London")

['id', 'hotel', 'location', 'pricepernight']
['2', 'Accor', 'London', '25']
['9', 'Eloquent', 'London', '42']
['10', 'Big Ben', 'London', '45']


In [None]:
csv_file.calculate_avg_price()

Avg. Price: 32.1


## **Intro to Numpy**

NumPy = Numerical Python

In [None]:
values = [1,2,3,4,5]
# list * int repeats the whole list; it does not multiply the elements
print(values * 2)

# list + list concatenates the lists; it does not add element-wise
print(values + values + values)

[1, 2, 3, 4, 5, 1, 2, 3, 4, 5]
[1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5]


In [None]:
# doubling each element of a plain list needs a loop or a list comprehension
[value*2  for value in values]

# value * 2

[2, 4, 6, 8, 10]

In [None]:
import numpy as np

values = [1,2,3,4,5]
np_values = np.array(values)

print(type(values), type(np_values))

<class 'list'> <class 'numpy.ndarray'>


In [None]:
np_values ** 3

array([  1,   8,  27,  64, 125])

In [None]:
ages = np.array([25, 28, 30, 19, 32])

# summary statistics are available as methods on the array
print(ages.mean())
print(ages.sum())
print(ages.min())
print(ages.max())
# NumPy's std defaults to ddof=0, i.e. the population standard deviation
print(ages.std())

26.8
134
19
32
4.534313619501853


In [None]:
np.std(ages)

4.534313619501853

In [None]:
# np.dot (dot product)
# with two scalars this is ordinary multiplication
np.dot(3, 5)

15

In [None]:
a1 = np.array([1,2,3])
a2 = np.array([2,5,2])

# =SUMPRODUCT()
# = (1*2) + (2*5) + (3*2)
np.dot(a1, a2)

18

In [None]:
# 2x2 matrices
a1 = np.array([
    [1,2],
    [3,4]
])

a2 = np.array([
    [5,6],
    [2,5]
])

# np.dot on two 2-D arrays is matrix multiplication: each entry is
# (row of a1) . (column of a2), e.g. row 0, column 1: (1,2) . (6,5) = 1*6 + 2*5 = 16
np.dot(a1, a2)

array([[ 9, 16],
       [23, 38]])

In [None]:
# np.arange(start, stop, step): stop is exclusive, so this gives the odd numbers 1..99
np.arange(1, 101,2)

array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33,
       35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
       69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99])

In [None]:
np.zeros((3,3))

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [None]:
np.ones((3,3))

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

## **Intro to Pandas**

- information of data frame
- select columns
- filter rows
- create new column
- aggregate + summarise
- value counts

In [None]:
import pandas as pd 
import numpy as np 

In [None]:
# load csv data
df = pd.read_csv("data/store.csv")

In [None]:
# preview dataset
df.tail(3)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
9991,9992,CA-2017-121258,2/26/2017,3/3/2017,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,TEC-PH-10003645,Technology,Phones,Aastra 57i VoIP phone,258.576,2,0.2,19.3932
9992,9993,CA-2017-121258,2/26/2017,3/3/2017,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,OFF-PA-10004041,Office Supplies,Paper,"It's Hot Message Books with Stickers, 2 3/4"" x 5""",29.6,4,0.0,13.32
9993,9994,CA-2017-119914,5/4/2017,5/9/2017,Second Class,CC-12220,Chris Cortes,Consumer,United States,Westminster,...,92683,West,OFF-AP-10002684,Office Supplies,Appliances,"Acco 7-Outlet Masterpiece Power Center, Wihtou...",243.16,2,0.0,72.948


In [None]:
# information of data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9994 non-null   int64  
 1   Order ID       9994 non-null   object 
 2   Order Date     9994 non-null   object 
 3   Ship Date      9994 non-null   object 
 4   Ship Mode      9994 non-null   object 
 5   Customer ID    9994 non-null   object 
 6   Customer Name  9994 non-null   object 
 7   Segment        9994 non-null   object 
 8   Country        9994 non-null   object 
 9   City           9994 non-null   object 
 10  State          9994 non-null   object 
 11  Postal Code    9994 non-null   int64  
 12  Region         9994 non-null   object 
 13  Product ID     9994 non-null   object 
 14  Category       9994 non-null   object 
 15  Sub-Category   9994 non-null   object 
 16  Product Name   9994 non-null   object 
 17  Sales          9994 non-null   float64
 18  Quantity

In [None]:
df.shape # attribute

(9994, 21)

In [None]:
df.describe()

Unnamed: 0,Row ID,Postal Code,Sales,Quantity,Discount,Profit
count,9994.0,9994.0,9994.0,9994.0,9994.0,9994.0
mean,4997.5,55190.379428,229.858001,3.789574,0.156203,28.656896
std,2885.163629,32063.69335,623.245101,2.22511,0.206452,234.260108
min,1.0,1040.0,0.444,1.0,0.0,-6599.978
25%,2499.25,23223.0,17.28,2.0,0.0,1.72875
50%,4997.5,56430.5,54.49,3.0,0.2,8.6665
75%,7495.75,90008.0,209.94,5.0,0.2,29.364
max,9994.0,99301.0,22638.48,14.0,0.8,8399.976


In [None]:
df.columns

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit'],
      dtype='object')

In [None]:
# clean column names: lower-case, with spaces and hyphens turned into underscores
def _to_snake_case(label):
    """Return `label` lower-cased, with " " and "-" replaced by "_"."""
    return label.lower().replace(" ", "_").replace("-", "_")

cols = df.columns
clean_cols = list(map(_to_snake_case, cols))
df.columns = clean_cols

df.head(1)

Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,postal_code,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136


In [None]:
# select columns
df['segment'].head()

0     Consumer
1     Consumer
2    Corporate
3     Consumer
4     Consumer
Name: segment, dtype: object

In [None]:
# create new column
selected_cols = ['order_id', 'segment', 'sales', 'state', 'city']

# .copy() makes df2 an independent DataFrame. Assigning a new column to the
# bare selection df[selected_cols] triggers pandas' SettingWithCopyWarning.
df2 = df[selected_cols].copy()

df2['tax'] = df2['sales'] * 0.25  # tax column = 25% of sales

df2.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['tax'] = df2['sales'] * 0.25


Unnamed: 0,order_id,segment,sales,state,city,tax
0,CA-2016-152156,Consumer,261.96,Kentucky,Henderson,65.49
1,CA-2016-152156,Consumer,731.94,Kentucky,Henderson,182.985
2,CA-2016-138688,Corporate,14.62,California,Los Angeles,3.655


In [None]:
# remove columns: drop() returns a new DataFrame, which is bound back to df2
df2 = df2.drop(columns=['order_id', 'city'])
df2.head()

Unnamed: 0,segment,sales,state,tax
0,Consumer,261.96,Kentucky,65.49
1,Consumer,731.94,Kentucky,182.985
2,Corporate,14.62,California,3.655
3,Consumer,957.5775,Florida,239.394375
4,Consumer,22.368,Florida,5.592


In [None]:
# filter data: build one boolean mask per condition, combine them with &,
# then select the matching rows and the columns of interest in one .loc call
is_furniture = df['category'] == 'Furniture'
is_home_office = df['segment'] == 'Home Office'

df.loc[is_furniture & is_home_office, ['customer_name', 'segment', 'category']].head(20)

Unnamed: 0,customer_name,segment,category
38,Steve Nguyen,Home Office,Furniture
39,Steve Nguyen,Home Office,Furniture
66,Paul Stevenson,Home Office,Furniture
96,Parhena Norris,Home Office,Furniture
124,Alan Dominguez,Home Office,Furniture
128,Lindsay Shagiari,Home Office,Furniture
129,Lindsay Shagiari,Home Office,Furniture
146,Maureen Gastineau,Home Office,Furniture
189,Mark Packer,Home Office,Furniture
192,Mark Packer,Home Office,Furniture


In [None]:
# query() method
result = df.query("category == 'Furniture' and segment == 'Consumer' ")[['customer_name', 'segment', 'category']].tail(10)

result

Unnamed: 0,customer_name,segment,category
9897,Liz Preis,Consumer,Furniture
9898,Liz Preis,Consumer,Furniture
9901,Sung Chung,Consumer,Furniture
9903,Darren Powers,Consumer,Furniture
9917,Tamara Manning,Consumer,Furniture
9928,Ionia McGrath,Consumer,Furniture
9931,Keith Herrera,Consumer,Furniture
9980,Shaun Weien,Consumer,Furniture
9989,Tom Boeckenhauer,Consumer,Furniture
9990,Dave Brooks,Consumer,Furniture


In [None]:
# export csv file
result.to_csv("data/output_store.csv")

In [None]:
!ls data

chinook.db  food.txt  hotel.csv  output.csv  output_store.csv  store.csv


In [None]:
# value counts
# normalize=True gives each segment's share of all rows instead of a raw count
# NOTE(review): the column names produced by reset_index() here differ between
# pandas versions — confirm them before relying on the CSV header
count_segment = df['segment'].value_counts(normalize=True).reset_index()
count_segment.to_csv("data/segment.csv")

count_segment

Unnamed: 0,index,segment
0,Consumer,0.519412
1,Corporate,0.302181
2,Home Office,0.178407


In [None]:
# statistics (aggregate functions)
# computed with pandas Series methods
total_sales = df['sales'].sum()
avg_sales = df['sales'].mean() 
# pandas .std() defaults to ddof=1, i.e. the sample standard deviation
std_quantity = df['quantity'].std()

print(f"Total Sales: {round(total_sales,2)}")
print(avg_sales, std_quantity)

Total Sales: 2297200.86
229.85800083049833 2.2251096911414


In [None]:
median_sales = np.median(df['sales'])
print(median_sales)

54.489999999999995


In [None]:
# groupby + aggregate
df.groupby('segment')['sales'].agg(['sum', 'mean', 'count', 'min', 'max'])

Unnamed: 0_level_0,sum,mean,count,min,max
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Consumer,1161401.0,223.733644,5191,0.444,13999.96
Corporate,706146.4,233.8233,3020,0.556,17499.95
Home Office,429653.1,240.972041,1783,0.99,22638.48


In [None]:
# Per state and segment: sum and mean of both sales and profit.
# Aggregating two columns with two functions gives a two-level column header.
result = (
    df.groupby(['state', 'segment'])[['sales', 'profit']]
    .agg(['sum', 'mean'])
    .reset_index()
)

result.head()
# result.to_csv("data/request_data_18Nov2022.csv")

Unnamed: 0_level_0,state,segment,sales,sales,profit,profit
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean,sum,mean
0,Alabama,Consumer,7537.54,301.5016,1711.0939,68.443756
1,Alabama,Corporate,10969.38,391.763571,3648.3846,130.29945
2,Alabama,Home Office,1003.72,125.465,427.3468,53.41835
3,Arizona,Consumer,16424.422,149.312927,-1423.0527,-12.936843
4,Arizona,Corporate,11736.322,170.091623,-788.9158,-11.433562


In [None]:
# OKAY : )

## **API**

API => Application Programming Interface

Request-Response cycle

import `requests`

In [None]:
import requests 
import time
import pandas as pd 

In [None]:
url ="https://swapi.dev/api/people/1"

In [None]:
# timeout (seconds): without one, requests can wait indefinitely on a server
# that never answers
response = requests.get(url, timeout=10)

In [None]:
response.status_code

200

In [None]:
result = response.json()

In [None]:
result['height']

'172'

In [None]:
result['mass']

'77'

In [None]:
result

{'name': 'Luke Skywalker',
 'height': '172',
 'mass': '77',
 'hair_color': 'blond',
 'skin_color': 'fair',
 'eye_color': 'blue',
 'birth_year': '19BBY',
 'gender': 'male',
 'homeworld': 'https://swapi.dev/api/planets/1/',
 'films': ['https://swapi.dev/api/films/1/',
  'https://swapi.dev/api/films/2/',
  'https://swapi.dev/api/films/3/',
  'https://swapi.dev/api/films/6/'],
 'species': [],
 'vehicles': ['https://swapi.dev/api/vehicles/14/',
  'https://swapi.dev/api/vehicles/30/'],
 'starships': ['https://swapi.dev/api/starships/12/',
  'https://swapi.dev/api/starships/22/'],
 'created': '2014-12-09T13:50:51.644000Z',
 'edited': '2014-12-20T21:17:56.891000Z',
 'url': 'https://swapi.dev/api/people/1/'}

In [None]:
names = []
heights = []
masses = []

# Fetch people 1..10, one request per person.
for i in range(1, 11):
    url = f"https://swapi.dev/api/people/{i}"
    # timeout (seconds): without one, requests can wait indefinitely.
    resp = requests.get(url, timeout=10)
    # Stop on an HTTP error status here, instead of failing below with a
    # KeyError because the error payload has no 'name' key.
    resp.raise_for_status()
    result = resp.json()
    names.append(result['name'])
    # height and mass arrive as strings (e.g. '172'), and are stored as such
    heights.append(result['height'])
    masses.append(result['mass'])
    time.sleep(1)  # wait one second between requests

# NOTE: this rebinds `df`, which held the store data in the pandas section.
df = pd.DataFrame({
    "name": names, "height": heights, "mass": masses
})

df

Unnamed: 0,name,height,mass
0,Luke Skywalker,172,77
1,C-3PO,167,75
2,R2-D2,96,32
3,Darth Vader,202,136
4,Leia Organa,150,49
5,Owen Lars,178,120
6,Beru Whitesun lars,165,75
7,R5-D4,97,32
8,Biggs Darklighter,183,84
9,Obi-Wan Kenobi,182,77


In [None]:
result # dictionary

{'name': 'Obi-Wan Kenobi',
 'height': '182',
 'mass': '77',
 'hair_color': 'auburn, white',
 'skin_color': 'fair',
 'eye_color': 'blue-gray',
 'birth_year': '57BBY',
 'gender': 'male',
 'homeworld': 'https://swapi.dev/api/planets/20/',
 'films': ['https://swapi.dev/api/films/1/',
  'https://swapi.dev/api/films/2/',
  'https://swapi.dev/api/films/3/',
  'https://swapi.dev/api/films/4/',
  'https://swapi.dev/api/films/5/',
  'https://swapi.dev/api/films/6/'],
 'species': [],
 'vehicles': ['https://swapi.dev/api/vehicles/38/'],
 'starships': ['https://swapi.dev/api/starships/48/',
  'https://swapi.dev/api/starships/59/',
  'https://swapi.dev/api/starships/64/',
  'https://swapi.dev/api/starships/65/',
  'https://swapi.dev/api/starships/74/'],
 'created': '2014-12-10T16:16:29.192000Z',
 'edited': '2014-12-20T21:17:50.325000Z',
 'url': 'https://swapi.dev/api/people/10/'}

## **Web Scraping**

In [None]:
!pip install gazpacho

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gazpacho
  Downloading gazpacho-1.1.tar.gz (7.9 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: gazpacho
  Building wheel for gazpacho (pyproject.toml) ... [?25l[?25hdone
  Created wheel for gazpacho: filename=gazpacho-1.1-py3-none-any.whl size=7482 sha256=31c9f7ca5b269e819190f9c1b9c1a427c3a5ba3dabd5d5c6d44008510c838017
  Stored in directory: /root/.cache/pip/wheels/ec/45/e0/490eb5e25601b4f9425fcde4a0034601c492a29e82268be4d3
Successfully built gazpacho
Installing collected packages: gazpacho
Successfully installed gazpacho-1.1


In [None]:
from gazpacho import Soup
from requests import get

In [None]:
url = "https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc"

In [None]:
# timeout (seconds): without one, the request can wait indefinitely
resp = get(url, timeout=10)
resp.status_code

200

In [None]:
imdb = Soup(resp.text)

In [None]:
imdb.find("h3", {'class': 'lister-item-header'}, mode='first').strip()

'1. The Shawshank Redemption (1994)'

In [None]:
# mode="all" always returns a list (possibly empty). The default mode returns
# a single Soup for one match and None for no match, and the comprehension
# below would fail on either.
titles = imdb.find("h3", {'class': 'lister-item-header'}, mode="all")

clean_titles = [title.strip() for title in titles]

# equivalent loop form (with clean_titles = [] first):
# for title in titles:
#     clean_titles.append(title.strip())

print(clean_titles)

['1. The Shawshank Redemption (1994)', '2. The Godfather (1972)', '3. The Dark Knight (2008)', '4. The Godfather Part II (1974)', "5. Schindler's List (1993)", '6. 12 Angry Men (1957)', '7. The Lord of the Rings: The Return of the King (2003)', '8. Pulp Fiction (1994)', '9. The Lord of the Rings: The Fellowship of the Ring (2001)', '10. Inception (2010)', '11. Fight Club (1999)', '12. Forrest Gump (1994)', '13. The Lord of the Rings: The Two Towers (2002)', '14. The Good, the Bad and the Ugly (1966)', '15. Goodfellas (1990)', "16. One Flew Over the Cuckoo's Nest (1975)", '17. The Matrix (1999)', '18. Star Wars: Episode V - The Empire Strikes Back (1980)', '19. Interstellar (2014)', '20. The Silence of the Lambs (1991)', '21. The Green Mile (1999)', '22. Se7en (1995)', '23. Star Wars (1977)', '24. Terminator 2: Judgment Day (1991)', '25. Spirited Away (2001)', '26. Saving Private Ryan (1998)', '27. City of God (2002)', '28. Life Is Beautiful (1997)', "29. It's a Wonderful Life (1946)", 

In [None]:
# mode="all" always returns a list, so the comprehension also works when the
# page has zero or one matching element.
rating_tags = imdb.find("div", {'class': 'ratings-imdb-rating'}, mode="all")
# `ratings` now only ever holds the parsed floats, not the Soup objects.
ratings = [float(tag.strip()) for tag in rating_tags]

print(ratings)

[9.3, 9.2, 9.0, 9.0, 9.0, 9.0, 9.0, 8.9, 8.8, 8.8, 8.8, 8.8, 8.8, 8.8, 8.7, 8.7, 8.7, 8.7, 8.6, 8.6, 8.6, 8.6, 8.6, 8.6, 8.6, 8.6, 8.6, 8.6, 8.6, 8.6, 8.6, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5]


In [None]:
import pandas as pd 
# one row per movie, built from the two scraped lists
# NOTE(review): assumes clean_titles and ratings have the same length and line
# up one-to-one — verify if the page layout changes
imdb_movies = pd.DataFrame({
    "title": clean_titles,
    "rating": ratings
})

imdb_movies.head(10)

Unnamed: 0,title,rating
0,1. The Shawshank Redemption (1994),9.3
1,2. The Godfather (1972),9.2
2,3. The Dark Knight (2008),9.0
3,4. The Godfather Part II (1974),9.0
4,5. Schindler's List (1993),9.0
5,6. 12 Angry Men (1957),9.0
6,7. The Lord of the Rings: The Return of the Ki...,9.0
7,8. Pulp Fiction (1994),8.9
8,9. The Lord of the Rings: The Fellowship of th...,8.8
9,10. Inception (2010),8.8


## **Basic sklearn ML**