In [25]:
import numpy as np
import pandas as pd
print('NumPy:', np.__version__, '| Pandas:', pd.__version__)

NumPy: 2.2.6 | Pandas: 2.3.3


In [26]:
arr = np.array([10, 20, 30, 40, 50])
print('Array:', arr)
print('Shape:', arr.shape, '| dtype:', arr.dtype)
print('Mean:', np.mean(arr), '| Std:', np.std(arr))
print('Min:', np.min(arr), '| Max:', np.max(arr))

Array: [10 20 30 40 50]
Shape: (5,) | dtype: int64
Mean: 30.0 | Std: 14.142135623730951
Min: 10 | Max: 50


In [27]:
matrix = np.arange(1, 10).reshape(3, 3)
print('Matrix:\n', matrix)
print('Row sums:', np.sum(matrix, axis=1))
print('Col means:', np.mean(matrix, axis=0))
print('Sorted flat:', np.sort(matrix.flatten()))

Matrix:
 [[1 2 3]
 [4 5 6]
 [7 8 9]]
Row sums: [ 6 15 24]
Col means: [4. 5. 6.]
Sorted flat: [1 2 3 4 5 6 7 8 9]


In [28]:

s = pd.Series([10, 20, 30, 40, 50])
print('Series:\n', s)

s2 = pd.Series([85, 92, 78, 95], index=['Math', 'Science', 'English', 'History'])
print('\nMarks Series:\n', s2)
print('Math mark:', s2['Math'])
print('Mean:', s2.mean())

Series:
 0    10
1    20
2    30
3    40
4    50
dtype: int64

Marks Series:
 Math       85
Science    92
English    78
History    95
dtype: int64
Math mark: 85
Mean: 87.5


In [29]:

arr = np.random.randint(50, 100, 5)
s = pd.Series(arr, index=['A','B','C','D','E'])
print('From numpy array:\n', s)
print('dtype:', s.dtype)
print('values:', s.values)  
print('index:', s.index.tolist())

From numpy array:
 A    84
B    56
C    70
D    82
E    72
dtype: int32
dtype: int32
values: [84 56 70 82 72]
index: ['A', 'B', 'C', 'D', 'E']


In [30]:

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'Age': [25, 30, 35, 28, 22],
    'City': ['Delhi', 'Mumbai', 'Chennai', 'Kolkata', 'Pune'],
    'Salary': [50000, 60000, 75000, 55000, 45000],
    'Score': [85, 92, 78, 88, 95]
}
df = pd.DataFrame(data)
print(df)

      Name  Age     City  Salary  Score
0    Alice   25    Delhi   50000     85
1      Bob   30   Mumbai   60000     92
2  Charlie   35  Chennai   75000     78
3    Diana   28  Kolkata   55000     88
4      Eve   22     Pune   45000     95


In [31]:

arr = np.random.randint(1, 100, size=(4, 3))
df_np = pd.DataFrame(arr, columns=['A', 'B', 'C'])
print('DataFrame from NumPy:\n', df_np)

DataFrame from NumPy:
     A   B   C
0  81  60  92
1  37  76  40
2  53  39  24
3  13  58  19


In [32]:
print('Shape:', df.shape)
print('Columns:', df.columns.tolist())
print('dtypes:\n', df.dtypes)
print('\nFirst 3 rows:\n', df.head(3))
print('\nLast 2 rows:\n', df.tail(2))
print('\nInfo:')
df.info()
print('\nDescribe:\n', df.describe())

Shape: (5, 5)
Columns: ['Name', 'Age', 'City', 'Salary', 'Score']
dtypes:
 Name      object
Age        int64
City      object
Salary     int64
Score      int64
dtype: object

First 3 rows:
       Name  Age     City  Salary  Score
0    Alice   25    Delhi   50000     85
1      Bob   30   Mumbai   60000     92
2  Charlie   35  Chennai   75000     78

Last 2 rows:
     Name  Age     City  Salary  Score
3  Diana   28  Kolkata   55000     88
4    Eve   22     Pune   45000     95

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
 2   City    5 non-null      object
 3   Salary  5 non-null      int64 
 4   Score   5 non-null      int64 
dtypes: int64(3), object(2)
memory usage: 328.0+ bytes

Describe:
              Age        Salary      Score
count   5.000000      5.000000   5.000000
mean   28.0

In [33]:

print('Name column:\n', df['Name'])
print('\nMultiple cols:\n', df[['Name', 'Salary']])


print('\nloc row 0:', df.loc[0])
print('\nloc rows 1-3:', df.loc[1:3, ['Name','Age','Salary']])

print('\niloc row 2:', df.iloc[2])
print('\niloc rows 0-2, cols 0-2:\n', df.iloc[0:3, 0:3])

Name column:
 0      Alice
1        Bob
2    Charlie
3      Diana
4        Eve
Name: Name, dtype: object

Multiple cols:
       Name  Salary
0    Alice   50000
1      Bob   60000
2  Charlie   75000
3    Diana   55000
4      Eve   45000

loc row 0: Name      Alice
Age          25
City      Delhi
Salary    50000
Score        85
Name: 0, dtype: object

loc rows 1-3:       Name  Age  Salary
1      Bob   30   60000
2  Charlie   35   75000
3    Diana   28   55000

iloc row 2: Name      Charlie
Age            35
City      Chennai
Salary      75000
Score          78
Name: 2, dtype: object

iloc rows 0-2, cols 0-2:
       Name  Age     City
0    Alice   25    Delhi
1      Bob   30   Mumbai
2  Charlie   35  Chennai


In [34]:

print('Age > 27:\n', df[df['Age'] > 27])
print('\nSalary > 55000:\n', df[df['Salary'] > 55000])
print('\nAge > 25 AND Score > 85:\n', df[(df['Age'] > 25) & (df['Score'] > 85)])
print('\nCity is Delhi or Mumbai:\n', df[df['City'].isin(['Delhi', 'Mumbai'])])

Age > 27:
       Name  Age     City  Salary  Score
1      Bob   30   Mumbai   60000     92
2  Charlie   35  Chennai   75000     78
3    Diana   28  Kolkata   55000     88

Salary > 55000:
       Name  Age     City  Salary  Score
1      Bob   30   Mumbai   60000     92
2  Charlie   35  Chennai   75000     78

Age > 25 AND Score > 85:
     Name  Age     City  Salary  Score
1    Bob   30   Mumbai   60000     92
3  Diana   28  Kolkata   55000     88

City is Delhi or Mumbai:
     Name  Age    City  Salary  Score
0  Alice   25   Delhi   50000     85
1    Bob   30  Mumbai   60000     92


In [35]:
df2 = df.copy()

df2['Bonus'] = df2['Salary'] * 0.10
print('With Bonus column:\n', df2[['Name','Salary','Bonus']])


df2['Salary_log'] = np.log(df2['Salary'])
print('\nWith log salary:\n', df2[['Name','Salary','Salary_log']])


df2 = df2.drop(columns=['Salary_log'])
print('\nAfter dropping column:', df2.columns.tolist())

With Bonus column:
       Name  Salary   Bonus
0    Alice   50000  5000.0
1      Bob   60000  6000.0
2  Charlie   75000  7500.0
3    Diana   55000  5500.0
4      Eve   45000  4500.0

With log salary:
       Name  Salary  Salary_log
0    Alice   50000   10.819778
1      Bob   60000   11.002100
2  Charlie   75000   11.225243
3    Diana   55000   10.915088
4      Eve   45000   10.714418

After dropping column: ['Name', 'Age', 'City', 'Salary', 'Score', 'Bonus']


In [36]:
df_miss = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, np.nan, 5],
    'C': [1, 2, 3, 4, 5]
})
print('DataFrame with NaN:\n', df_miss)
print('\nNull check:\n', df_miss.isnull())
print('\nNull counts:\n', df_miss.isnull().sum())
print('\nFill with 0:\n', df_miss.fillna(0))
print('\nFill with mean:\n', df_miss.fillna(df_miss.mean(numeric_only=True)))
print('\nDrop NaN rows:\n', df_miss.dropna())
print('\nDrop NaN cols:\n', df_miss.dropna(axis=1))

DataFrame with NaN:
      A    B  C
0  1.0  NaN  1
1  2.0  2.0  2
2  NaN  3.0  3
3  4.0  NaN  4
4  5.0  5.0  5

Null check:
        A      B      C
0  False   True  False
1  False  False  False
2   True  False  False
3  False   True  False
4  False  False  False

Null counts:
 A    1
B    2
C    0
dtype: int64

Fill with 0:
      A    B  C
0  1.0  0.0  1
1  2.0  2.0  2
2  0.0  3.0  3
3  4.0  0.0  4
4  5.0  5.0  5

Fill with mean:
      A         B  C
0  1.0  3.333333  1
1  2.0  2.000000  2
2  3.0  3.000000  3
3  4.0  3.333333  4
4  5.0  5.000000  5

Drop NaN rows:
      A    B  C
1  2.0  2.0  2
4  5.0  5.0  5

Drop NaN cols:
    C
0  1
1  2
2  3
3  4
4  5


In [37]:
print('Sort by Salary (desc):\n', df.sort_values('Salary', ascending=False))
print('\nSort by Age then Score:\n', df.sort_values(['Age','Score'], ascending=[True, False]))
print('\nSort by index desc:\n', df.sort_index(ascending=False))

Sort by Salary (desc):
       Name  Age     City  Salary  Score
2  Charlie   35  Chennai   75000     78
1      Bob   30   Mumbai   60000     92
3    Diana   28  Kolkata   55000     88
0    Alice   25    Delhi   50000     85
4      Eve   22     Pune   45000     95

Sort by Age then Score:
       Name  Age     City  Salary  Score
4      Eve   22     Pune   45000     95
0    Alice   25    Delhi   50000     85
3    Diana   28  Kolkata   55000     88
1      Bob   30   Mumbai   60000     92
2  Charlie   35  Chennai   75000     78

Sort by index desc:
       Name  Age     City  Salary  Score
4      Eve   22     Pune   45000     95
3    Diana   28  Kolkata   55000     88
2  Charlie   35  Chennai   75000     78
1      Bob   30   Mumbai   60000     92
0    Alice   25    Delhi   50000     85


In [38]:
data2 = {
    'Department': ['HR','IT','IT','HR','Finance','Finance','IT'],
    'Employee': ['A','B','C','D','E','F','G'],
    'Salary': [40000,80000,75000,45000,70000,65000,90000],
    'Experience': [2,5,4,3,6,5,7]
}
df3 = pd.DataFrame(data2)
print(df3)
print('\nMean salary by dept:\n', df3.groupby('Department')['Salary'].mean())
print('\nMultiple agg:\n', df3.groupby('Department').agg({'Salary':['mean','max','min'], 'Experience':'mean'}))

  Department Employee  Salary  Experience
0         HR        A   40000           2
1         IT        B   80000           5
2         IT        C   75000           4
3         HR        D   45000           3
4    Finance        E   70000           6
5    Finance        F   65000           5
6         IT        G   90000           7

Mean salary by dept:
 Department
Finance    67500.000000
HR         42500.000000
IT         81666.666667
Name: Salary, dtype: float64

Multiple agg:
                   Salary               Experience
                    mean    max    min       mean
Department                                       
Finance     67500.000000  70000  65000   5.500000
HR          42500.000000  45000  40000   2.500000
IT          81666.666667  90000  75000   5.333333


In [39]:
df_a = pd.DataFrame({'ID': [1,2,3,4], 'Name': ['Alice','Bob','Charlie','Diana']})
df_b = pd.DataFrame({'ID': [1,2,3,5], 'Score': [85,92,78,88]})

print('Inner join:\n', pd.merge(df_a, df_b, on='ID', how='inner'))
print('\nLeft join:\n', pd.merge(df_a, df_b, on='ID', how='left'))
print('\nOuter join:\n', pd.merge(df_a, df_b, on='ID', how='outer'))

df_top = pd.DataFrame({'A': [1,2], 'B': [3,4]})
df_bot = pd.DataFrame({'A': [5,6], 'B': [7,8]})
print('\nConcat rows:\n', pd.concat([df_top, df_bot], ignore_index=True))

Inner join:
    ID     Name  Score
0   1    Alice     85
1   2      Bob     92
2   3  Charlie     78

Left join:
    ID     Name  Score
0   1    Alice   85.0
1   2      Bob   92.0
2   3  Charlie   78.0
3   4    Diana    NaN

Outer join:
    ID     Name  Score
0   1    Alice   85.0
1   2      Bob   92.0
2   3  Charlie   78.0
3   4    Diana    NaN
4   5      NaN   88.0

Concat rows:
    A  B
0  1  3
1  2  4
2  5  7
3  6  8


In [40]:
df2 = df.copy()
df2['Name_upper'] = df2['Name'].apply(lambda x: x.upper())
df2['Salary_category'] = df2['Salary'].apply(lambda x: 'High' if x > 60000 else 'Low')

df2['Salary_sqrt'] = df2['Salary'].apply(np.sqrt)

df2['Combined'] = df2.apply(lambda row: f"{row['Name']} - {row['City']}", axis=1)
print(df2[['Name','Name_upper','Salary','Salary_category','Combined']])

      Name Name_upper  Salary Salary_category           Combined
0    Alice      ALICE   50000             Low      Alice - Delhi
1      Bob        BOB   60000             Low       Bob - Mumbai
2  Charlie    CHARLIE   75000            High  Charlie - Chennai
3    Diana      DIANA   55000             Low    Diana - Kolkata
4      Eve        EVE   45000             Low         Eve - Pune


In [41]:
data4 = {
    'Region': ['North','South','North','South','East','East'],
    'Product': ['A','A','B','B','A','B'],
    'Sales': [100,150,200,120,180,160]
}
df4 = pd.DataFrame(data4)
pivot = df4.pivot_table(values='Sales', index='Region', columns='Product', aggfunc='sum')
print('Pivot table:\n', pivot)

print('\nAs numpy array:\n', pivot.values)

Pivot table:
 Product    A    B
Region           
East     180  160
North    100  200
South    150  120

As numpy array:
 [[180 160]
 [100 200]
 [150 120]]


In [42]:
df2 = df.copy()
print('Uppercase:', df2['Name'].str.upper().tolist())
print('Lowercase:', df2['City'].str.lower().tolist())
print('Length:', df2['Name'].str.len().tolist())
print('Contains a:', df2['Name'].str.contains('a', case=False).tolist())
print('Starts with A:', df2['Name'].str.startswith('A').tolist())
print('Replace i->I:', df2['City'].str.replace('i','I').tolist())

Uppercase: ['ALICE', 'BOB', 'CHARLIE', 'DIANA', 'EVE']
Lowercase: ['delhi', 'mumbai', 'chennai', 'kolkata', 'pune']
Length: [5, 3, 7, 5, 3]
Contains a: [True, False, True, True, False]
Starts with A: [True, False, False, False, False]
Replace i->I: ['DelhI', 'MumbaI', 'ChennaI', 'Kolkata', 'Pune']


In [43]:
df2 = df.copy()
df2['Salary_norm'] = (df2['Salary'] - np.mean(df2['Salary'])) / np.std(df2['Salary'])
df2['Score_pct'] = np.percentile(df2['Score'], 75)

salary_array = df2['Salary'].to_numpy()
print('Salary as numpy array:', salary_array)
print('Numpy mean:', np.mean(salary_array))
print('Numpy std:', np.std(salary_array))

df2['Grade'] = np.where(df2['Score'] >= 90, 'A', np.where(df2['Score'] >= 80, 'B', 'C'))
print('\nWith Grade:\n', df2[['Name','Score','Grade','Salary_norm']])

Salary as numpy array: [50000 60000 75000 55000 45000]
Numpy mean: 57000.0
Numpy std: 10295.630140987001

With Grade:
       Name  Score Grade  Salary_norm
0    Alice     85     B    -0.679900
1      Bob     92     A     0.291386
2  Charlie     78     C     1.748315
3    Diana     88     B    -0.194257
4      Eve     95     A    -1.165543


In [44]:

df.to_csv('data.csv', index=False)
df_loaded = pd.read_csv('data.csv')
print('Loaded from CSV:\n', df_loaded.head())

df.to_excel('data.xlsx', index=False)
df_xl = pd.read_excel('data.xlsx')
print('\nLoaded from Excel:\n', df_xl.head())

Loaded from CSV:
       Name  Age     City  Salary  Score
0    Alice   25    Delhi   50000     85
1      Bob   30   Mumbai   60000     92
2  Charlie   35  Chennai   75000     78
3    Diana   28  Kolkata   55000     88
4      Eve   22     Pune   45000     95

Loaded from Excel:
       Name  Age     City  Salary  Score
0    Alice   25    Delhi   50000     85
1      Bob   30   Mumbai   60000     92
2  Charlie   35  Chennai   75000     78
3    Diana   28  Kolkata   55000     88
4      Eve   22     Pune   45000     95
