In [17]:
import pandas as pd
import numpy as np

## DataFrame
A DataFrame is like an Excel spreadsheet: it has rows and columns. Unlike Excel, though, it can comfortably handle millions of rows and run complex operations on them very quickly.

**Real-world analogy**: Think of a DataFrame like a filing cabinet:

- Each **drawer** is a column (like "Name", "Age", "Salary")
- Each **folder** is a row (like information about one person)
- The **label on each drawer** is the column name
- The **number on each folder** is the row index

In [18]:
# CREATING DATAFRAMES
# Four ways to build a pandas DataFrame from in-memory Python / NumPy data.

# Method 1: From a dictionary (most common)
# Each key becomes a column name; each list holds that column's values.
employee_data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'Age': [25, 30, 35, 28, 32],
    'Department': ['HR', 'IT', 'Finance', 'IT', 'HR'],
    'Salary': [50000, 65000, 70000, 58000, 52000]
}

df = pd.DataFrame(employee_data)
print(df)

# Method 2: From a list of dictionaries (each dict is a row)
# The dict keys become the column names.
employee_records = [
    {'Name': 'Alice', 'Age': 25, 'Department': 'HR', 'Salary': 50000},
    {'Name': 'Bob', 'Age': 30, 'Department': 'IT', 'Salary': 65000},
    {'Name': 'Charlie', 'Age': 35, 'Department': 'Finance', 'Salary': 70000}
]

df2 = pd.DataFrame(employee_records)
print(df2)

# Method 3: From separate lists, one per column
# The column names are supplied as the keys of the dict that wraps the lists.
names = ['Alice', 'Bob', 'Charlie']
ages = [25, 30, 35]
departments = ['HR', 'IT', 'Finance']

df3 = pd.DataFrame({
    'Name': names,
    'Age': ages,
    'Department': departments
})

# Method 4: From numpy arrays (for numerical data)
# numpy is already imported as `np` in the notebook's import cell, so it is not
# re-imported here. A locally seeded Generator is used so that re-running the
# notebook reproduces the same numbers without touching the global RNG state.
random_data = np.random.default_rng(seed=42).standard_normal((5, 3))  # 5 rows, 3 columns of random numbers
df4 = pd.DataFrame(random_data,
                   columns=['Column1', 'Column2', 'Column3'],
                   index=['Row1', 'Row2', 'Row3', 'Row4', 'Row5'])

      Name  Age Department  Salary
0    Alice   25         HR   50000
1      Bob   30         IT   65000
2  Charlie   35    Finance   70000
3    Diana   28         IT   58000
4      Eve   32         HR   52000
      Name  Age Department  Salary
0    Alice   25         HR   50000
1      Bob   30         IT   65000
2  Charlie   35    Finance   70000


## Understanding DataFrame Structure

In [19]:
# Exploring our employee DataFrame
# The frame is rebuilt from employee_data so this cell starts from a known state.
df = pd.DataFrame(employee_data)

# Basic information
print("Shape (rows, columns):", df.shape)        # (5, 4)
print("Number of rows:", len(df))                 # 5
print("Number of columns:", len(df.columns))      # 4
print("Column names:", df.columns.tolist())       # ['Name', 'Age', 'Department', 'Salary']
print("Index:", df.index.tolist())               # [0, 1, 2, 3, 4]

# Data types of each column
print("\nData types:")
print(df.dtypes)

# Memory usage
print("\nMemory usage:")
# df.info() writes its report (index, columns, dtypes, memory usage) to stdout
# itself and returns None, so it is called directly; wrapping it in print()
# would append a stray "None" line to the output.
df.info()

Shape (rows, columns): (5, 4)
Number of rows: 5
Number of columns: 4
Column names: ['Name', 'Age', 'Department', 'Salary']
Index: [0, 1, 2, 3, 4]

Data types:
Name          object
Age            int64
Department    object
Salary         int64
dtype: object

Memory usage:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        5 non-null      object
 1   Age         5 non-null      int64 
 2   Department  5 non-null      object
 3   Salary      5 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 292.0+ bytes
None


## Basic DataFrame Operations

In [20]:
# Basic ways to look at a DataFrame: the first/last rows, columns selected by
# name, and rows selected by integer position.
# Each entry pairs the heading to print with the selection it introduces.
views = [
    # Viewing data
    ("First 3 rows:", df.head(3)),                     # first 3 rows
    ("\nLast 2 rows:", df.tail(2)),                    # last 2 rows
    # Accessing columns
    ("\nAll names:", df['Name']),                      # single label -> Series
    ("\nMultiple columns:", df[['Name', 'Salary']]),   # list of labels -> DataFrame
    # Accessing rows
    ("\nFirst row:", df.iloc[0]),                      # single position -> Series
    ("\nMultiple rows:", df.iloc[0:3]),                # slice -> DataFrame with rows 0, 1, 2
]

# Print every heading followed by its selection, in the order listed above.
for heading, selection in views:
    print(heading)
    print(selection)

First 3 rows:
      Name  Age Department  Salary
0    Alice   25         HR   50000
1      Bob   30         IT   65000
2  Charlie   35    Finance   70000

Last 2 rows:
    Name  Age Department  Salary
3  Diana   28         IT   58000
4    Eve   32         HR   52000

All names:
0      Alice
1        Bob
2    Charlie
3      Diana
4        Eve
Name: Name, dtype: object

Multiple columns:
      Name  Salary
0    Alice   50000
1      Bob   65000
2  Charlie   70000
3    Diana   58000
4      Eve   52000

First row:
Name          Alice
Age              25
Department       HR
Salary        50000
Name: 0, dtype: object

Multiple rows:
      Name  Age Department  Salary
0    Alice   25         HR   50000
1      Bob   30         IT   65000
2  Charlie   35    Finance   70000


### Practical Exercise 2

In [21]:
# Practical exercise: build a small movie table, summarise it, then filter it.

# Column-oriented source data: one key per column, one list entry per movie.
movies = {
    'Title': ['The Matrix', 'Inception', 'Interstellar', 'Pulp Fiction', 'The Godfather'],
    'Year': [1999, 2010, 2014, 1994, 1972],
    'Genre': ['Sci-Fi', 'Sci-Fi', 'Sci-Fi', 'Crime', 'Crime'],
    'Rating': [8.7, 8.8, 8.6, 8.9, 9.2],
    'Duration': [136, 148, 169, 154, 175]
}

movie_df = pd.DataFrame(data=movies)

# Show the whole table under its heading (heading and table on separate lines).
print("Movie database:", movie_df, sep="\n")

# Summary figures: row count, mean rating (one decimal place), longest running time.
print(
    "\nBasic info:",
    f"Total movies: {len(movie_df)}",
    f"Average rating: {movie_df['Rating'].mean():.1f}",
    f"Longest movie: {movie_df['Duration'].max()} minutes",
    sep="\n",
)

# Keep only the movies that run longer than 150 minutes: build a boolean mask
# over the Duration column, then use it to select the matching rows.
runs_over_150 = movie_df['Duration'] > 150
long_movies = movie_df.loc[runs_over_150]
print("\nMovies longer than 150 minutes:")
print(long_movies.loc[:, ['Title', 'Duration']])

Movie database:
           Title  Year   Genre  Rating  Duration
0     The Matrix  1999  Sci-Fi     8.7       136
1      Inception  2010  Sci-Fi     8.8       148
2   Interstellar  2014  Sci-Fi     8.6       169
3   Pulp Fiction  1994   Crime     8.9       154
4  The Godfather  1972   Crime     9.2       175

Basic info:
Total movies: 5
Average rating: 8.8
Longest movie: 175 minutes

Movies longer than 150 minutes:
           Title  Duration
2   Interstellar       169
3   Pulp Fiction       154
4  The Godfather       175
