# Python - Pandas

### Basics

In [7]:
import pandas as pd

In [8]:
s = pd.Series([1, 3, 5, 7, 9])
print(s)

0    1
1    3
2    5
3    7
4    9
dtype: int64


In [9]:
s = pd.Series([1, 3, 5, 7, 9], index=['a', 'b', 'c', 'd', 'e'])
print(s)

a    1
b    3
c    5
d    7
e    9
dtype: int64


In [10]:
print(s['a'])  
print(s[0])    

1
1


  print(s[0])


In [11]:
data = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 30, 35],
        'Salary': [50000, 60000, 70000]}
df = pd.DataFrame(data)
print(df)
data = [{'Name': 'Alice', 'Age': 25}, {'Name': 'Bob', 'Age': 30}]
df = pd.DataFrame(data)
print(df)


      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   35   70000
    Name  Age
0  Alice   25
1    Bob   30


In [12]:
print(df['Name'])

print(df[['Name', 'Age']])

print(df.iloc[1])  # By index position
print(df.loc[0])   # By index label


0    Alice
1      Bob
Name: Name, dtype: object
    Name  Age
0  Alice   25
1    Bob   30
Name    Bob
Age      30
Name: 1, dtype: object
Name    Alice
Age        25
Name: 0, dtype: object


In [14]:
df = pd.DataFrame({'A': [1, 2, None], 'B': [4, None, 6]})
print(df.isnull())
print(df.dropna())  # Drops rows with any missing values
print(df.fillna(0))  # Fills missing values with 0


       A      B
0  False  False
1  False   True
2   True  False
     A    B
0  1.0  4.0
     A    B
0  1.0  4.0
1  2.0  0.0
2  0.0  6.0


In [16]:
df['C'] = df['A'] + df['B']
print(df.fillna("Yashdeep"))

          A         B         C
0       1.0       4.0       5.0
1       2.0  Yashdeep  Yashdeep
2  Yashdeep       6.0  Yashdeep


In [18]:
df = df.rename(columns={'A': 'Alpha', 'B': 'Beta','C':"Celta"})
print(df)

   Alpha  Beta  Celta
0    1.0   4.0    5.0
1    2.0   NaN    NaN
2    NaN   6.0    NaN


In [22]:
df['Alpha'] = df['Alpha'].astype('float')
print(df.dtypes)

Alpha    float64
Beta     float64
Celta    float64
dtype: object


In [23]:
df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie', 'David'],
                   'Age': [25, 30, 35, 40]})
print(df[df['Age'] > 30])

      Name  Age
2  Charlie   35
3    David   40


In [24]:
df = df.set_index('Name')
print(df)
df = df.reset_index()
print(df)

         Age
Name        
Alice     25
Bob       30
Charlie   35
David     40
      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
3    David   40


In [37]:
df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Alice', 'Bob'],
                   'Year': [2020, 2020, 2021, 2021],
                   'Score': [85, 88, 90, 92]})
grouped = df.groupby('Name')
print(grouped['Score'].mean())
print(grouped.agg({'Score': ['mean', 'max']}))


Name
Alice    87.5
Bob      90.0
Name: Score, dtype: float64
      Score    
       mean max
Name           
Alice  87.5  90
Bob    90.0  92


In [29]:
df1 = pd.DataFrame({'Name': ['Alice', 'Bob'], 'Age': [25, 30]})
df2 = pd.DataFrame({'Name': ['Alice', 'Bob'], 'Salary': [50000, 60000]})
merged = pd.merge(df1, df2, on='Name')
print(merged)

    Name  Age  Salary
0  Alice   25   50000
1    Bob   30   60000


In [30]:
df1 = df1.set_index('Name')
df2 = df2.set_index('Name')
joined = df1.join(df2)
print(joined)

       Age  Salary
Name              
Alice   25   50000
Bob     30   60000


In [34]:
df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie', 'David'],
                   'Year': [2020, 2020, 2021, 2021],
                   'Score': [85, 88, 90, 92]})
pivoted = df.pivot(index='Name', columns='Year', values='Score')
print(pivoted.fillna(0))

Year     2020  2021
Name               
Alice    85.0   0.0
Bob      88.0   0.0
Charlie   0.0  90.0
David     0.0  92.0


In [35]:
stacked = pivoted.stack()
print(stacked)

unstacked = stacked.unstack()
print(unstacked)

Name     Year
Alice    2020    85.0
Bob      2020    88.0
Charlie  2021    90.0
David    2021    92.0
dtype: float64
Year     2020  2021
Name               
Alice    85.0   NaN
Bob      88.0   NaN
Charlie   NaN  90.0
David     NaN  92.0


In [36]:
df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie'],
                   'Age': [25, 30, 35],
                   'Salary': [50000, 60000, 70000]})
print(df.describe())  # Summary statistics
print(df['Age'].mean())  # Mean of the 'Age' column

        Age   Salary
count   3.0      3.0
mean   30.0  60000.0
std     5.0  10000.0
min    25.0  50000.0
25%    27.5  55000.0
50%    30.0  60000.0
75%    32.5  65000.0
max    35.0  70000.0
30.0


In [41]:
import numpy as np
dates = pd.date_range('20240801', periods=6)
df = pd.DataFrame({'Date': dates, 'Value': np.random.randn(6)})
df['Month'] = df['Date'].dt.month
print(df)


        Date     Value  Month
0 2024-08-01  1.444140      8
1 2024-08-02  1.121749      8
2 2024-08-03  2.927914      8
3 2024-08-04 -0.893168      8
4 2024-08-05 -0.270794      8
5 2024-08-06 -0.005308      8


In [47]:
#df = df.to_csv('data.csv')
df = pd.read_csv('data.csv')
#df = pd.read_sql('SELECT * FROM table_name', connection)
df

Unnamed: 0.1,Unnamed: 0,Date,Value,Month
0,0,2024-08-01,1.44414,8
1,1,2024-08-02,1.121749,8
2,2,2024-08-03,2.927914,8
3,3,2024-08-04,-0.893168,8
4,4,2024-08-05,-0.270794,8
5,5,2024-08-06,-0.005308,8


In [49]:
tips = pd.read_csv('tips.csv')
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [54]:
tips.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [55]:
tips.tail(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
234,15.53,3.0,Male,Yes,Sat,Dinner,2
235,10.07,1.25,Male,No,Sat,Dinner,2
236,12.6,1.0,Male,Yes,Sat,Dinner,2
237,32.83,1.17,Male,Yes,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [64]:
df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 15)})
result = df.eval('C = A + B')
result2 = result.copy()
result

Unnamed: 0,A,B,C
0,1,10,11
1,2,11,13
2,3,12,15
3,4,13,17
4,5,14,19


In [66]:
print("Result 2....................................................")
result2.T

Result 2....................................................


Unnamed: 0,0,1,2,3,4
A,1,2,3,4,5
B,10,11,12,13,14
C,11,13,15,17,19


### Benifits of Pandas

##### 1.Data Handling: Pandas provides efficient tools for data cleaning, preparation, and manipulation.
##### 2.Flexibility: It supports a wide range of data types and operations, including time series.
##### 3.Integration: Pandas integrates seamlessly with other libraries like NumPy, Matplotlib, and scikit-learn.
##### 4.Performance: Pandas is built on top of NumPy, providing efficient data operations.
##### 5.Data Analysis: It offers comprehensive tools for exploratory data analysis and statistical modeling.

### Comparison
##### 1.Speed: Pandas DataFrames are optimized for speed, especially with large datasets.
##### 2.Memory Efficiency: Pandas is more memory-efficient than lists and dictionaries when handling large datasets.
##### 3.Functionality: Unlike lists and dictionaries, Pandas provides built-in methods for data aggregation, merging, filtering, and visualization.
##### 4.Indexing: Pandas provides more powerful indexing and selection capabilities compared to lists and dictionaries.