# **Importing Pandas**

In [None]:
import pandas as pd
import numpy as np

# **Pandas**

**Pandas** is a powerful **Python library for data manipulation and analysis.** It provides easy-to-use data structures and functions to work with structured data like tabular, time series, or matrix data.

**Pandas primarily provides two data structures: Series and DataFrame.**

**Series:** A one-dimensional labeled array capable of holding any data type.

**DataFrame:** A two-dimensional labeled data structure with columns of potentially different types.

# **Pandas - Series**

**Series** in pandas is a fundamental data structure that represents a one-dimensional array of indexed data. It can hold any type of **data—integers, strings, floats, Python objects**, etc. The Series object is built on top of the NumPy array and is very similar to it but with additional capabilities like handling missing data. The indices of a pandas Series are more flexible than those in a simple NumPy array.

# Creating a Series

In [None]:
s = pd.Series([1, 3, 5, 7, 9])
print(s)

**Key Attributes**

**Values:** The data in the Series.

**Index:** The index (labels) of each data point.

# **Common Methods of Series**


# Descriptive Statistics

**s.describe():** Provides a quick summary of the data.

This method gives a statistical summary of the Series, including count, mean, standard deviation, minimum, maximum, and quartile values.

In [None]:
# Creating a Series
s = pd.Series([1, 3, 5, 7, 9])

# Descriptive statistics
print(s.describe())

**s.mean():** Computes the mean of the data.

In [None]:
# Mean of the Series
print(s.mean())

**s.std():** Computes the standard deviation.

In [None]:
# Standard deviation of the Series
print(s.std())

**s.min() and s.max():** Computes the minimum and maximum values.

In [None]:
# Minimum and maximum values
print(s.min())
print(s.max())

# Data Manipulation

**s.map(func):** Applies a function to each element in the Series.

In [None]:
# Mapping function to double the values
doubled = s.map(lambda x: x * 2)
print(doubled)

**s.apply(func):** Similar to map, but more flexible. (apply can also be used on DataFrames, where it operates on whole rows or columns, whereas Series.map always works element by element.)

In [None]:
# Applying a function to calculate square root
sqrt = s.apply(lambda x: x ** 0.5)
print(sqrt)

**s.sort_values():** Sorts the Series.

In [None]:
# Sorting the Series
sorted_s = s.sort_values()
print(sorted_s)

**s.drop(labels):** Drops specified labels from the Series.

In [None]:
# Dropping the element whose index label is 0. drop() works on labels, not
# positions; label 0 is the first element here only because s uses the
# default 0..n-1 integer index. The result is a new Series bound to `dropped`.
dropped = s.drop(0)
print(dropped)

In [None]:
print(s)

# Handling Missing Data

**s.isnull():** Checks for missing values, returns a Series of booleans.

In [None]:
# Checking for missing values
print(s.isnull())

**s.notnull():** Opposite of isnull().

In [None]:
# Checking for non-null values
print(s.notnull())

**s.fillna(value):** Fills missing values with a specified value.

In [None]:
# Create a Series with missing values
s = pd.Series([1, 2, np.nan, 4, np.nan])

# Print the Series
print(s)

In [None]:
# Filling missing values with 9 (the result is stored in a new Series, `filled`)
filled = s.fillna(9)
print(filled)

**s.dropna():** Drops all entries that contain missing values and returns the result as a new Series.

In [None]:
# Creating a Series with missing values
s_with_missing = pd.Series([1, 2, None, 4, 5])

# Dropping missing values
dropped_missing = s_with_missing.dropna()
print(dropped_missing)

# Indexing, Slicing, and Filtering

**s.iloc[ ]:** Purely integer-location based indexing.

In [None]:
# Indexing by position
s = pd.Series([1, 3, 5, 7, 9])
print(s.iloc[0])  # First element
print(s.iloc[-1])  # Last element

**s.loc[ ]:** Label-based indexing.

In [None]:
# Indexing by label
print(s.loc[0])  # First element
print(s.loc[4])  # Last element

In [None]:
# Create a Series
s = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print(s)
# Accessing elements using iloc
print(s.iloc[0])  # Access the first element
print(s.iloc[1:3])  # Access elements at positions 1 and 2 (exclusive of 3)

# Accessing elements using loc
print(s.loc['a'])  # Access the element with index label 'a'
print(s.loc['b':'c'])  # Access elements with index labels 'b' and 'c'

**s[s > n]:** Filters and returns elements greater than n.

In [None]:
# Filtering elements greater than 5
filtered = s[s > 5]
print(filtered)

# Aggregation

**s.sum():** Sums up the values.

In [None]:
# Sum of the Series
print(s.sum())

**s.cumsum():** Cumulative sum.

In [None]:
# Cumulative sum of the Series
print(s)
print(s.cumsum())

**s.aggregate(func):** Aggregates using one or more operations.

In [None]:
# Aggregating using multiple operations
aggregated = s.aggregate(['sum', 'mean', 'std'])
print(aggregated)

# Creating Data Frame

In [None]:
# Define data
data = {
    'Name': ['Amit', 'Priya', 'Rahul', 'Neha', 'Ravi', 'Ananya', 'Karan', 'Sneha', 'Divya', 'Rohan'],
    'Role': ['Manager', 'Engineer', 'Analyst', 'Developer', 'Designer', 'HR', 'Manager', 'Engineer', 'Analyst', 'Developer'],
    'Email': ['amit@example.com', 'priya@example.com', 'rahul@example.com', 'neha@example.com', 'ravi@example.com', 'ananya@example.com', 'karan@example.com', 'sneha@example.com', 'divya@example.com', 'rohan@example.com'],
    'Phone Number': ['9876543210', '8765432109', '7654321098', '6543210987', '5432109876', '4321098765', '3210987654', '2109876543', '1098765432', '0987654321'],
    'Address': ['123 MG Road', '456 Brigade Road', '789 Indiranagar', '321 Jayanagar', '567 Koramangala', '890 Whitefield', '234 BTM Layout', '567 HSR Layout', '890 Malleshwaram', '123 Rajajinagar'],
    'Blood Group': ['A+', 'B-', 'O+', 'AB+', 'A-', 'O-', 'B+', 'AB-', 'A+', 'O+']
}

# Create DataFrame
team_df = pd.DataFrame(data)

# Display DataFrame
team_df.head()

In [None]:
team_df

# Merging Data Frames

In [None]:
# Employee records: one row per employee with an ID, a name and a department
data1 = dict(
    ID=[1, 2, 3, 4],
    Name=['John', 'Alice', 'Bob', 'Emma'],
    Department=['HR', 'Engineering', 'Finance', 'Marketing'],
)
df1 = pd.DataFrame(data=data1)

# Salary records keyed by the same ID column.
# IDs 1-3 appear in both tables; ID 4 has no salary row and ID 5 has no employee row.
data2 = dict(
    ID=[1, 2, 3, 5],
    Salary=[50000, 60000, 70000, 55000],
)
df2 = pd.DataFrame(data=data2)

In [None]:
# Inner join on the shared 'ID' column: only IDs present in both frames are kept
merged_df = pd.merge(
    left=df1,
    right=df2,
    how='inner',
    on='ID',
)

In [None]:
merged_df

# **Importing Dataset**

Importing datasets into Pandas is straightforward, and Pandas supports various file formats such as CSV, XLSX, JSON, and SQL.

In [None]:
df = pd.read_csv('./datasets/filmtv_movies.csv')

# Display the first few rows of the DataFrame to understand its structure and contents
df.head()

The dataset contains information about movies, represented in a DataFrame structure.

**filmtv_id:** A unique identifier for each movie.

**title:** The title of the movie.

**year:** The release year of the movie.

**genre:** The genre of the movie.

**duration:** The duration of the movie in minutes.

**country:** The country where the movie was produced.

**directors:** Names of the directors of the movie.

**actors:** Names of the main actors in the movie.

**avg_vote, critics_vote, public_vote:** Average ratings from different sources.

**total_votes:** Total number of votes the movie received.

**description:** A short description of the movie plot.

**notes:** Additional notes or commentary about the movie.

**humor, rhythm, effort, tension, erotism:** Various attributes rated on a scale (probably from 0 to a maximum value, representing different aspects of the movie).

# **Pandas - DataFrame**

# Properties of DataFrame

**df.head(n):**
The df.head(n) method is used to view the first n rows of the DataFrame. This is particularly useful for getting a quick snapshot of the data, especially to understand the structure and the types of data contained in each column. If you don't specify n, the default number of rows displayed is 5.

In [None]:
df.head(10)  # Displays the first 10 rows of the DataFrame

**df.tail(n):**
The df.tail(n) method is similar to df.head(n) but for the end of the DataFrame. It returns the last n rows. This is useful to see the most recent or the last few entries in your data, depending on the ordering of your dataset. Like df.head(n), the default value of n is 5 if it isn't specified.

In [None]:
df.tail(10)  # Displays the last 10 rows of the DataFrame

**df.shape:**
The df.shape attribute of a DataFrame returns a tuple representing the dimensionality of the DataFrame. The first element of the tuple is the number of rows, and the second is the number of columns. This is useful when you need to know how large the dataset is, such as when you are preprocessing data or ensuring that data manipulations have executed correctly.

In [None]:
df.shape  # Outputs: (number of rows, number of columns)

**df.columns:**
The df.columns attribute returns an Index object containing the column labels of the DataFrame. Knowing the column names is essential for accessing specific data in the DataFrame, performing analyses, and for data manipulation tasks like sorting, filtering, or applying functions to certain columns.

In [None]:
df.columns  # Lists all the column names in the DataFrame

**Inspecting Data Types:** Each column in a DataFrame has a specific data type. Understanding these types is crucial for proper data manipulation.

In [None]:
# Display the data types of each column
df.dtypes

**Summary Statistics:** For numerical data, it's useful to get a sense of their central tendency and spread.

In [None]:
# Display summary statistics for numerical columns
df.describe()

# Accessing and Filtering:

**df.loc:**
df.loc is the label-based indexer: you access rows and columns using their labels (i.e., index labels and column names) inside square brackets, for example df.loc[rows, columns]. It allows for selecting a subset of rows and columns from a DataFrame with powerful and flexible slicing, indexing, and filtering options. Note that, unlike ordinary Python slices, label slices with loc include both endpoints.

In [None]:
df.head(5)

In [None]:
# Selecting all rows and a specific column by label
titles = df.loc[:, 'title']
titles

In [None]:
# Selecting a range of rows and multiple columns by labels
subset = df.loc[10:20, ['title', 'year', 'genre']]
subset

In [None]:
# Conditional selection using a boolean array
dramas = df.loc[df['genre'] == 'Drama']
dramas

In [None]:
multiple_condition = df.loc[(df['genre'] == 'Drama') & (df['avg_vote']>7.0)]
multiple_condition

**df.iloc:**
While df.loc uses labels for indexing, df.iloc allows for integer-based indexing. You use df.iloc to access rows and columns by their integer positions, which makes it useful when you need to access data by its position in the DataFrame.

In [None]:
df.head(5)

In [None]:
# Selecting a single row from the DataFrame
single_row = df.iloc[0]
single_row

In [None]:
# Selecting a specific row and columns by integer indices
specific_data = df.iloc[10, [1, 2, 3]]  # row at index 10 and columns at indices 1, 2, and 3
specific_data

In [None]:
# Slicing to get multiple rows and columns
multi_slice = df.iloc[10:15, 0:4]  # Rows 10 to 14 and columns 0 to 3
multi_slice

**df.at:**
df.at is designed to access a single value for a row/column label pair. It is very similar to df.loc for accessing scalar values but is optimized for faster access when you only need to get or set a single value in a DataFrame.

In [None]:
# Access a specific single value using row label and column name
title_of_first_movie = df.at[0, 'title']
title_of_first_movie

**Filtering Based on Criteria:**
Filtering data based on specific criteria is a common operation in data analysis. Pandas provides several methods to perform these operations, often using boolean indexing.

In [None]:
# Filter movies released after 2010
recent_movies = df[df['year'] > 2010]
recent_movies

In [None]:
# Movies with a high public vote and specific genre
highly_rated_thrillers = df[(df['public_vote'] >= 8) & (df['genre'] == 'Thriller')]
highly_rated_thrillers

In [None]:
# Movies from a specific country
us_movies = df[df['country'] == 'United States']
us_movies

# Updating Rows and Columns

**df.drop:**
The .drop() method in pandas is used to remove rows or columns from a DataFrame. Its primary purpose is to drop specified labels from rows or columns.

**Parameters:**

**labels:** The row or column labels to drop.

**axis:** Specifies whether the labels refer to rows (axis=0) or columns (axis=1). By default, it's 0 (rows).

**index or columns:** An alternative way to specify the labels to drop, instead of using the labels parameter. It is equivalent to specifying axis=0 (for index) or axis=1 (for columns).

**inplace:** If True, the operation is done in place, meaning it modifies the DataFrame directly and returns None. If False or not specified, it returns a new DataFrame with the specified labels dropped.

In [None]:
df.drop(labels='title',axis=1)

In [None]:
df

**Direct Assignment:**
Directly assign a value to a specific column or even a cell in a DataFrame.

In [None]:
df.at[0, 'year'] = 1983  # Changes the year of the first movie to 1983
df.head(5)

In [None]:
df['new_column'] = 'default value'  # Adds a new column with all entries set to 'default value'
df

In [None]:
df.drop(axis=1,labels='new_column',inplace=True)

In [None]:
df.head(5)

**Using loc for Conditional Updates:**
loc can be used to update rows and columns based on a condition.

In [None]:
# Start every row as "not classic" so that rows failing the condition hold
# False instead of NaN (assigning through loc alone would create the column
# with NaN for every unmatched row and give it object dtype).
df['classic'] = False
df.loc[df['year'] < 2000, 'classic'] = True  # Marks movies before 2000 as classic
df

In [None]:
# Modifying multiple columns using loc.
# Both flag columns are created as False first, so rows with avg_vote <= 6
# (or a missing avg_vote) end up False rather than NaN.
df['top_rated'] = False
df['must_watch'] = False
df.loc[df['avg_vote'] > 6, ['top_rated', 'must_watch']] = [True, True]

In [None]:
df

**Using apply Function:**
The apply function allows you to apply a function along an axis of the DataFrame.

In [None]:
# Label each movie by running time: a duration above 120 is 'Long', anything else is 'Short'.
# NOTE(review): a missing duration (NaN) fails the `x > 120` test and is therefore
# labelled 'Short' -- confirm that this is the intended treatment of missing values.
df['length_category'] = df['duration'].apply(lambda x: 'Long' if x > 120 else 'Short')
df

In [None]:
# Create a DataFrame with multiple Series
data = {
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]
}
num_data = pd.DataFrame(data)

In [None]:
num_data

In [None]:
# Helper used with apply(): adds its two arguments together
def sum_series(x, y):
    """Return ``x + y`` for the two operands passed in."""
    total = x + y
    return total

# Apply the function on multiple Series using apply()
result = num_data.apply(lambda row: sum_series(row['A'], row['B']), axis=1)

# Print the result
print(result)

**Updating Using map or replace:**
You can update a column based on a mapping dictionary or replace values.

In [None]:
# NOTE: with a dict, Series.map turns every value that is not one of the dict's
# keys into NaN, so any genre other than 'Drama' and 'Comedy' shows up as NaN below.
# The result is only displayed here; it is not assigned back to df['genre'].
df['genre'].map({'Drama': 'Drama Film', 'Comedy': 'Comedy Film'}) # Mapping existing values to new ones

In [None]:
# Replacing specific values.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['country']: that form modifies an
# intermediate Series, which triggers a chained-assignment warning in recent
# pandas and leaves df unchanged once Copy-on-Write is enabled.
df['country'] = df['country'].replace('USA', 'United States')
df

**Adding New Columns Based on Calculations:**
You can create new columns based on calculations from existing columns.

In [None]:
df['title_year'] = df['title'] + " (" + df['year'].astype(str) + ")" # Creating a new column by combining existing columns
df

**Using assign to Create Columns:**
assign helps you add new columns to a DataFrame in a functional style.

In [None]:
df = df.assign(
    is_older=lambda x: x['year'] < 2000,
    duration_hours=lambda x: x['duration'] / 60
) # Adding multiple new columns

In [None]:
df

# Changing the name of Index
Pandas allows you to rename the index of a DataFrame or Series, which can help in making the index more informative or aligning it with new data requirements.

**Renaming the Index of a DataFrame:**

In [None]:
df.index.names = ['movie_id']  # Renames the index to 'movie_id'

In [None]:
df

**Renaming Column Indexes:**

In [None]:
df.rename(columns={'year': 'release_year', 'title': 'movie_title'}, inplace=True)
df

# Display Options

In [None]:
# Set maximum number of rows and columns to display
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 5)

In [None]:
df

In [None]:
# Reset Options
pd.reset_option('display')

In [None]:
df

# Grouping Data:
Grouping data is a powerful way to perform segment-wise analysis and break down the dataset into chunks based on some criteria.

In [None]:
genre_groups = df.groupby('genre')  # Groups the data by the 'genre' column
genre_groups

In [None]:
for genre, group_data in genre_groups:
    print(f"Genre: {genre}")
    print(group_data)
    print()

In [None]:
year_genre_groups = df.groupby(['release_year', 'genre'])  # Groups by year and genre

# Aggregation

After grouping, you might want to perform aggregation operations like sum, mean, count, etc., to summarize the data.

In [None]:
# Simple Aggregation - Calculating the Average
avg_duration_by_genre = df.groupby('genre')['duration'].mean()  # Average duration per genre
print(avg_duration_by_genre)

In [None]:
# Multiple Aggregations on a Single Column
# Aggregations are given as string aliases rather than NumPy callables:
# passing np.mean / np.std / np.min / np.max to agg is deprecated in pandas,
# and np.std (population, ddof=0) is not the same statistic as pandas' 'std'
# (sample, ddof=1). The aliases also give stable column names
# ('mean', 'std', 'min', 'max') regardless of the NumPy version.
stats_by_genre = df.groupby('genre')['avg_vote'].agg(['mean', 'std', 'min', 'max'])
print(stats_by_genre)

In [None]:
# Different Aggregations for Different Columns
# Each key is a column and each value is the aggregation (or list of
# aggregations) to run on it. String aliases are used instead of NumPy
# callables, which pandas has deprecated as agg arguments.
complex_aggregation = df.groupby('genre').agg({
    'duration': 'mean',  # average duration
    'avg_vote': ['min', 'max'],  # min and max average votes
    'public_vote': 'sum'  # total of public votes
})

In [None]:
complex_aggregation

**Aggregating Without Grouping:**
Sometimes, you may want to perform aggregations without the need to group the data.

In [None]:
# Overall Summary Statistics
overall_stats = df[['duration', 'avg_vote']].describe()
overall_stats

**df.count():** This method returns the number of non-null values in each DataFrame column. It can be used to count the number of non-null values in each column individually.

In [None]:
# Create a DataFrame
data = {'A': [1, 2, None], 'B': [4, None, 6], 'C': [5, 8, 9]}
data_df = pd.DataFrame(data)

# Count non-null values in each column
counts = data_df.count()
print(counts)

**df[column].value_counts():** Called on a Series, such as a single column of the DataFrame, this method returns the frequency counts of its unique values. It is useful for analyzing the distribution of values within that column.

In [None]:
# Value Counts
df['movie_title'].value_counts()

**Custom Aggregation Functions:**
Pandas allows you to define and use custom aggregation functions for more specific data analysis needs.

In [None]:
# Using a Custom Function for Aggregation
def range_func(series):
    """Return the spread of ``series``: its maximum minus its minimum."""
    highest = series.max()
    lowest = series.min()
    return highest - lowest

range_by_genre = df.groupby('genre')['duration'].agg(range_func)  # Range of durations by genre
range_by_genre

**Renaming Grouped Aggregation Results:**
It is often useful to rename the results of aggregations for clarity or further analysis.

In [None]:
# Renaming Aggregation Results
# Named aggregation: each keyword is the output column name and each value is
# the aggregation to compute. The names contain spaces, so they are passed by
# unpacking a dict.
renamed_aggregations = df.groupby('genre')['avg_vote'].agg(**{
    'Average Rating': 'mean',  # mean of avg_vote per genre
    'Rating Standard Deviation': 'std',  # standard deviation of avg_vote per genre
})
renamed_aggregations