### Homework: Pandas Basics

#### Part 1: Reading Files  
1. **`chinook.db`**  
   - Use the `sqlite3` library to connect to the database.  
   - Read the `customers` table into a pandas DataFrame. Display the first 10 rows.  

2. **`iris.json`**  
   - Load the JSON file into a DataFrame. Show the shape of the dataset and the column names.  

3. **`titanic.xlsx`**  
   - Load the Excel file into a DataFrame. Use `head` to display the first 5 rows.  

4. **Flights parquet file**  
   - Read the Parquet file into a DataFrame and use `info` to summarize it.  

5. **`movie.csv`**  
   - Load the CSV file into a DataFrame and display a random sample of 10 rows.

---

In [1]:
import sqlite3
import pandas as pd
import os

In [2]:
# Part 1 Task 1
def connect_to_chinook():
    """Load the ``customers`` table of the Chinook SQLite database.

    Returns:
        pandas.DataFrame | None: every row of ``customers``, or ``None`` when
        the database file is missing or the query fails (a message is
        printed in either case).
    """
    path = '../data/chinook.db'

    # sqlite3.connect() silently creates an empty database for a missing
    # file, so check for the file first, like the other loaders here do.
    if not os.path.exists(path):
        print(f'File not found in {path}')
        return None

    try:
        chinook = sqlite3.connect(path)
        try:
            return pd.read_sql('SELECT * FROM customers', con=chinook)
        finally:
            # ``with sqlite3.connect(...)`` only commits / rolls back the
            # transaction; it never closes the connection, so do it here.
            chinook.close()

    except sqlite3.OperationalError as e:
        print(f'Error occured {e}')
        return None

    except Exception as e:
        print(f'Something unexpected happened {e}!')
        return None

In [3]:
# Part 1 Task 2
def open_iris():
    """Read ``../data/iris.json`` into a DataFrame.

    Returns:
        pandas.DataFrame | None: the parsed JSON, or ``None`` when the file
        does not exist or cannot be read (a message is printed).
    """
    path = r'../data/iris.json'

    try:
        if os.path.exists(path):
            return pd.read_json(path)

        # Nothing to load: report it and signal failure to the caller.
        print(f'File not found in {path}')
        return None

    except Exception as e:
        print(f'Something unexpected happen {e}!')
        return None
    


In [4]:
# Part 1 Task 3
def load_titanic():
    """Read the first sheet of ``../data/titanic.xlsx`` into a DataFrame.

    Returns:
        pandas.DataFrame | None: the sheet contents, or ``None`` when the
        file does not exist or cannot be read (a message is printed).
    """
    path = r'../data/titanic.xlsx'

    try:
        workbook_found = os.path.exists(path)
        if not workbook_found:
            print(f'File not found in {path}')
            return None

        # sheet_name=0 selects the first worksheet by position.
        return pd.read_excel(path, sheet_name=0)

    except Exception as e:
        print(f'Error occured {e}')
        return None


In [18]:
# Part 1 Task 4
def open_parquet():
    """Read the Parquet data at ``../data/flights`` into a DataFrame.

    Returns:
        pandas.DataFrame | None: the loaded data, or ``None`` when the path
        does not exist or cannot be read (a message is printed).
    """
    path = r'../data/flights'

    try:
        if os.path.exists(path):
            flights = pd.read_parquet(path)
            return flights

        print(f'File does not exist in {path}')
        return None
    except Exception as e:
        print(f'Something unexpected happen {e}')
        return None

In [6]:
# Part 1 Task 5
def read_movies_csv_file():
    """Read ``../data/movie.csv`` into a DataFrame.

    Returns:
        pandas.DataFrame | None: the parsed CSV, or ``None`` when the file
        does not exist or cannot be read (a message is printed).
    """
    path = r'../data/movie.csv'

    try:
        csv_found = os.path.exists(path)
        if csv_found:
            return pd.read_csv(path)

        print(f'File does not exist in {path}')
        return None
    except Exception as e:
        print(f'Error occured {e}')
        # Same result as falling off the end of the function, spelled out.
        return None

---

#### Part 2: Exploring DataFrames  
1. Using the DataFrame from **`iris.json`**:  
   - Rename the columns to lowercase.  
   - Select only the `sepal_length` and `sepal_width` columns.  

2. From the **`titanic.xlsx`** DataFrame:  
   - Filter rows where the age of passengers is above 30.  
   - Count the number of male and female passengers (`value_counts`).  

3. From the **Flights parquet file**:  
   - Extract and print only the `origin`, `dest`, and `carrier` columns.  
   - Find the number of unique destinations.  

4. From the **`movie.csv`** file:  
   - Filter rows where `duration` is greater than 120 minutes.  
   - Sort the filtered DataFrame by `director_facebook_likes` in descending order.  

---

In [7]:
# Part 2 Task 1
def work_with_iris(data_from_iris):
    """Lower-case the column names and select the two sepal columns.

    The rename happens in place, so the caller's DataFrame is modified.

    Args:
        data_from_iris: DataFrame whose lower-cased column names include
            ``sepallength`` and ``sepalwidth``.

    Returns:
        tuple: ``(lowercase_columns, sepal_columns)`` - the list of
        lower-cased column names and a DataFrame holding only the
        ``sepallength`` and ``sepalwidth`` columns.
    """
    data_from_iris.rename(columns=str.lower, inplace=True)

    lowercase_columns = list(data_from_iris.columns)
    sepal_columns = data_from_iris[['sepallength', 'sepalwidth']]
    return lowercase_columns, sepal_columns

In [8]:
# Part 2 Task 2
def work_with_titanic(titanic_data):
    """Filter passengers older than 30 and count passengers per sex.

    Args:
        titanic_data: DataFrame with ``Age`` and ``Sex`` columns.

    Returns:
        tuple: ``(above_30_age, number_of_male_and_females)`` - the rows
        whose ``Age`` is greater than 30, and the ``value_counts`` of
        ``Sex`` computed over the whole (unfiltered) DataFrame.
    """
    older_than_30 = titanic_data['Age'] > 30
    sex_counts = titanic_data['Sex'].value_counts()
    return titanic_data[older_than_30], sex_counts

In [9]:
# Part 2 Task 3
def work_with_flights(data_flights):
    """Select origin/destination columns and count flights per destination city.

    Column names are lower-cased in place first, so the caller's DataFrame
    is modified.

    NOTE(review): the assignment text asks for a ``carrier`` column and the
    number of unique destinations; this returns ``carrierdelay`` and the
    per-city ``value_counts`` instead - confirm against the dataset schema.

    Args:
        data_flights: DataFrame whose lower-cased column names include
            ``origin``, ``dest``, ``carrierdelay`` and ``destcityname``.

    Returns:
        tuple: ``(origin_dest_carrier, numbers_of_destinations)`` - the three
        selected columns, and the ``value_counts`` of ``destcityname``.
    """
    data_flights.rename(columns=str.lower, inplace=True)

    wanted_columns = ['origin', 'dest', 'carrierdelay']
    flights_per_city = data_flights['destcityname'].value_counts()
    return data_flights[wanted_columns], flights_per_city

In [10]:
# Part 2 Task 4
def work_with_movies(data_movies):
    """Return movies longer than 120 minutes, most-liked directors first.

    Args:
        data_movies: DataFrame with ``duration`` and
            ``director_facebook_likes`` columns.

    Returns:
        pandas.DataFrame: rows whose ``duration`` exceeds 120, sorted by
        ``director_facebook_likes`` in descending order.
    """
    is_long = data_movies['duration'] > 120
    long_movies = data_movies[is_long]
    return long_movies.sort_values(by='director_facebook_likes', ascending=False)

#### Part 3: Challenges and Explorations  
 
- From **`iris.json`**: Calculate the mean, median, and standard deviation for each numerical column.  
- From **`titanic.xlsx`**: Find the minimum, maximum, and sum of passenger ages.  

- From **`movie.csv`**:  
    - Identify the director with the highest total `director_facebook_likes`.  
    - Find the 5 longest movies and their respective directors.  

- From **Flights parquet file**:  
    - Check for missing values in the dataset. Fill missing values in a numerical column with the column’s mean.  

In [11]:
# Part 3 Task 1
def calculation_iris(data_from_iris):
    """Compute mean, median and standard deviation of the four measurements.

    Args:
        data_from_iris: DataFrame containing the lower-case columns
            ``sepallength``, ``sepalwidth``, ``petallength`` and
            ``petalwidth``.

    Returns:
        pandas.DataFrame: one row per statistic (``mean``, ``median``,
        ``std``) and one column per measurement.
    """
    measurement_columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
    measurements = data_from_iris.loc[:, measurement_columns]
    return measurements.agg(['mean', 'median', 'std'], axis=0)

In [12]:
# Part 3 Task 2
def calculation_titanic(titanic_data):
    """Compute the minimum, maximum and sum of the ``Age`` column.

    Args:
        titanic_data: DataFrame with an ``Age`` column.

    Returns:
        pandas.Series: indexed by ``min``, ``max`` and ``sum``.
    """
    ages = titanic_data['Age']
    return ages.agg(['min', 'max', 'sum'], axis=0)

In [13]:
# Part 3 Task 3
def statistics_movies(data_movies):
    """Find the most-liked director and the five longest movies.

    Args:
        data_movies: DataFrame with ``director_name``,
            ``director_facebook_likes`` and ``duration`` columns.

    Returns:
        tuple: ``(director_most_liked, five_long_films_and_directors)``.
        The first item is a Series with the keys ``director_name`` and
        ``director_facebook_likes`` for the director whose likes, summed
        over all of their rows, are the highest. The second is a DataFrame
        with the ``director_name`` and ``duration`` of the five rows with
        the largest ``duration``, longest first.

    Raises:
        ValueError: if no row has a director name (there is nothing to rank).
    """
    # The task asks for the highest *total*, so add up the likes per director.
    # (The previous ``.loc[0]`` after sorting picked the row *labelled* 0,
    # i.e. the first row of the unsorted data, not the top-ranked one.)
    likes_per_director = data_movies.groupby('director_name')['director_facebook_likes'].sum()
    top_director = likes_per_director.idxmax()
    director_most_liked = pd.Series({
        'director_name': top_director,
        'director_facebook_likes': likes_per_director.loc[top_director],
    })

    five_long_films_and_directors = (
        data_movies[['director_name', 'duration']]
        .sort_values(by='duration', ascending=False)
        .head(5)
    )
    return director_most_liked, five_long_films_and_directors


In [14]:
# Part 3 Task 4
def fill_None_in_flights(data_flights):
    """Replace missing values in every numeric column with that column's mean.

    The DataFrame is modified in place and also returned. Non-numeric
    columns are left untouched.

    Args:
        data_flights: DataFrame to clean.

    Returns:
        pandas.DataFrame: the same ``data_flights`` object, with the gaps in
        its numeric columns filled.
    """
    numeric_columns = data_flights.select_dtypes(include='number').columns
    for column in numeric_columns:
        # Assign the filled column back instead of calling
        # ``df[column].fillna(..., inplace=True)``: that form works on a
        # temporary object, triggers a chained-assignment warning on recent
        # pandas and changes nothing once Copy-on-Write is enabled.
        data_flights[column] = data_flights[column].fillna(data_flights[column].mean())
    return data_flights

In [16]:
def main():
    """Run every homework task in order and print its result.

    Each loader returns ``None`` when its file cannot be read (and prints
    the reason itself). Every use of a dataset is therefore guarded, so one
    missing file only skips the tasks that need it instead of aborting the
    whole run with an ``AttributeError``.
    """
    # ---------------------------------------------------------------- Part 1
    print('Part 1 Results\nTask 1\n')
    customers_table = connect_to_chinook()
    # Test the loader's result itself; calling .head() first would already
    # have failed on None.
    if customers_table is not None:
        print(customers_table.head(10))
    else:
        print('Can not connect data')

    print('Task 2\n')
    data_from_iris = open_iris()
    if data_from_iris is not None:
        print(data_from_iris.shape)
        print(list(data_from_iris.columns))
    else:
        print('Problem with results')

    print('Task 3\n')
    titanic_data = load_titanic()
    if titanic_data is not None:
        print(titanic_data.head(5))
    else:
        print('Problem with results')

    print('Task 4\n')
    data_flights = open_parquet()
    if data_flights is not None:
        # info() prints its summary itself and returns None, so wrapping it
        # in print() only added a stray "None" line.
        data_flights.info()
    else:
        print('Problem with results')

    print('Task 5\n')
    movies = read_movies_csv_file()
    if movies is not None:
        # sample() raises when asked for more rows than the frame holds.
        print(movies.sample(min(10, len(movies))))
    else:
        print('Problem with results')

    # ---------------------------------------------------------------- Part 2
    print('Part 2 Results\n Task 1\n')
    if data_from_iris is not None:
        lowercase_columns, sepal_columns = work_with_iris(data_from_iris)
        print(lowercase_columns)
        print(sepal_columns)
    else:
        print('Problem with results')

    print('Task 2\n')
    if titanic_data is not None:
        above_30_age, number_of_male_and_females = work_with_titanic(titanic_data)
        print(above_30_age)
        print(number_of_male_and_females)
    else:
        print('Problem with results')

    print('Task 3\n')
    if data_flights is not None:
        origin_dest_carrier, numbers_of_destinations = work_with_flights(data_flights)
        print(origin_dest_carrier)
        print(numbers_of_destinations)
    else:
        print('Problem with results')

    print('Task 4\n')
    if movies is not None:
        print(work_with_movies(movies))
    else:
        print('Problem with results')

    # ---------------------------------------------------------------- Part 3
    print('Part 3 Results\n Task 1')
    if data_from_iris is not None:
        # Relies on work_with_iris() above having lower-cased the column
        # names of this same DataFrame in place.
        print(calculation_iris(data_from_iris))
    else:
        print('Problem with results')

    print('Task 2\n')
    if titanic_data is not None:
        print(calculation_titanic(titanic_data))
    else:
        print('Problem with results')

    print('Task 3\n')
    if movies is not None:
        director_most_liked, five_long_films_and_directors = statistics_movies(movies)
        print(director_most_liked)
        print(five_long_films_and_directors)
    else:
        print('Problem with results')

    print('Task 4\n')
    if data_flights is not None:
        print(fill_None_in_flights(data_flights))
    else:
        print('Problem with results')

In [17]:
# Run the homework only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()

Part 1 Results
Task 1

   CustomerId  FirstName     LastName  \
0           1       Luís    Gonçalves   
1           2     Leonie       Köhler   
2           3   François     Tremblay   
3           4      Bjørn       Hansen   
4           5  František  Wichterlová   
5           6     Helena         Holý   
6           7     Astrid       Gruber   
7           8       Daan      Peeters   
8           9       Kara      Nielsen   
9          10    Eduardo      Martins   

                                            Company  \
0  Embraer - Empresa Brasileira de Aeronáutica S.A.   
1                                              None   
2                                              None   
3                                              None   
4                                  JetBrains s.r.o.   
5                                              None   
6                                              None   
7                                              None   
8                             