# Basic Data Structures in pandas (Series and DataFrame)

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
# A Series built from a plain list receives the default integer index.
# np.nan stands in for a missing value in the fourth position.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [9]:
# Supplying `index` labels each value explicitly instead of using 0..n-1.
s = pd.Series(data=[1, 3, 5, 7], index=list('abcd'))
print(s)

a    1
b    3
c    5
d    7
dtype: int64


In [12]:

# From a dictionary: the keys become the index labels, the values the data
d = dict(a=1, b=2, c=3)
s = pd.Series(data=d)
print(s)

a    1
b    2
c    3
dtype: int64


In [14]:


# With a scalar value: the scalar is repeated once for every index label
s = pd.Series(data=5, index=list('abc'))
print(s)

a    5
b    5
c    5
dtype: int64


In [16]:
# DataFrame construction from five common kinds of input.
# `df` (and `data`) are rebound at each step; only the last frame survives the cell.

# From a dictionary of Series: each key becomes a column name
d = {'col1': pd.Series([1, 2, 3]),
     'col2': pd.Series([4, 5, 6])}
df = pd.DataFrame(d)
print(df)

# From a dictionary of lists: each key becomes a column, each list its values
data = {'Name': ['John', 'Anna', 'Peter', 'Linda'],
        'Age': [28, 34, 29, 42],
        'City': ['New York', 'Paris', 'Berlin', 'London']}
df = pd.DataFrame(data)
print(df)


# With specific columns and index: `columns` picks which keys of `data` to keep
# ('City' is left out here) and `index` supplies the row labels
df = pd.DataFrame(data, columns=['Name', 'Age'], index=['a', 'b', 'c', 'd'])
print(df)

# From a list of dictionaries: one row per dict; a key missing from a dict
# ('c' in the first one) shows up as NaN in that row
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data)
print(df)

# From a NumPy array: 4 rows x 3 columns of standard-normal samples.
# A seeded generator is used so the printed numbers are identical on every run.
arr = np.random.default_rng(42).standard_normal((4, 3))
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])
print(df)

   col1  col2
0     1     4
1     2     5
2     3     6
    Name  Age      City
0   John   28  New York
1   Anna   34     Paris
2  Peter   29    Berlin
3  Linda   42    London
    Name  Age
a   John   28
b   Anna   34
c  Peter   29
d  Linda   42
   a   b     c
0  1   2   NaN
1  5  10  20.0
          A         B         C
0 -1.585783 -0.935098 -0.327775
1 -0.308338  0.184018 -1.130936
2 -2.609700 -0.113127  1.021855
3  1.767210 -1.624709 -0.008958


In [22]:
# Summary statistics with describe(): count, mean, std, min, quartiles and max
# for each numeric column.
# The samples come from a seeded generator so both printed tables are
# reproducible on every run.
arr = np.random.default_rng(0).standard_normal((4, 3))
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])
print(df)
print(df.describe())

          A         B         C
0 -0.058097  0.761300  1.045502
1  0.591813  0.609975 -0.345646
2 -1.105658 -0.731811 -2.019276
3  0.338289  1.188959  0.835259
              A         B         C
count  4.000000  4.000000  4.000000
mean  -0.058413  0.457106 -0.121040
std    0.747638  0.829666  1.405830
min   -1.105658 -0.731811 -2.019276
25%   -0.319987  0.274528 -0.764054
50%    0.140096  0.685637  0.244806
75%    0.401670  0.868215  0.887819
max    0.591813  1.188959  1.045502


In [None]:
# Selecting data from a DataFrame: by column, by position (iloc),
# by label (loc) and with boolean masks.

# Sample DataFrame (rebuilt here so the cell does not depend on earlier state)
data = {
    'Name': ['John', 'Anna', 'Peter', 'Linda'],
    'Age': [28, 34, 29, 42],
    'City': ['New York', 'Paris', 'Berlin', 'London'],
    'Salary': [50000, 60000, 55000, 70000]
}
df = pd.DataFrame(data)
print(df)

# Select a single column (returns Series)
ages = df['Age']
print(ages)

# Select multiple columns (returns DataFrame) - note the list inside the brackets
subset = df[['Name', 'Salary']]
print(subset)

# Select rows by position using iloc
first_row = df.iloc[0]  # First row
print(first_row)

# iloc slices exclude the end position, like regular Python slices
specific_cells = df.iloc[1:3, 0:2]  # Rows 1-2, columns 0-1
print(specific_cells)

# Select rows and columns by label using loc
# (with the default index the labels are the integers 0..3)
row_by_index = df.loc[2]  # Row with index 2
print(row_by_index)

# If you set a custom index, loc looks rows up by those labels instead.
# set_index returns a new frame, so rebinding `df` avoids inplace=True;
# from here on 'Name' is the index and no longer a regular column.
df = df.set_index('Name')
peter_data = df.loc['Peter']
print(peter_data)

# Filtering with boolean conditions: the comparison yields a True/False mask
adults = df[df['Age'] > 30]
print(adults)

# Combine masks with & (and) / | (or); each condition needs its own parentheses
high_salary_london = df[(df['Salary'] > 60000) & (df['City'] == 'London')]
print(high_salary_london)

    Name  Age      City  Salary
0   John   28  New York   50000
1   Anna   34     Paris   60000
2  Peter   29    Berlin   55000
3  Linda   42    London   70000
0    28
1    34
2    29
3    42
Name: Age, dtype: int64
    Name  Salary
0   John   50000
1   Anna   60000
2  Peter   55000
3  Linda   70000
Name          John
Age             28
City      New York
Salary       50000
Name: 0, dtype: object
    Name  Age
1   Anna   34
2  Peter   29
Name       Peter
Age           29
City      Berlin
Salary     55000
Name: 2, dtype: object
Age           29
City      Berlin
Salary     55000
Name: Peter, dtype: object
       Age    City  Salary
Name                      
Anna    34   Paris   60000
Linda   42  London   70000
       Age    City  Salary
Name                      
Linda   42  London   70000


In [12]:
# Modifying a DataFrame: adding, updating and deleting columns and rows.
# The frame is rebuilt first so the cell gives the same result on every run.
data = {
    'Name': ['John', 'Anna', 'Peter', 'Linda'],
    'Age': [28, 34, 29, 42],
    'City': ['New York', 'Paris', 'Berlin', 'London'],
    'Salary': [50000, 60000, 55000, 70000]
}
df = pd.DataFrame(data)

# Add a new column (one value per existing row)
df['Experience'] = [3, 8, 5, 15]
print(df)

# Alternative way to add a column, at a chosen position
# df.insert(1, 'Department', ['HR', 'Finance', 'Engineering', 'Marketing'])
# print(df)

# Update a single value, addressed by row label and column name
df.at[0, 'Age'] = 29
print(df)

# Update a single cell with loc (row label 1, column 'Salary')
df.loc[1, 'Salary'] = 65000
print(df)

# Update based on a condition: 10% raise for people over 30.
# Multiplying by 1.1 produces floats, and writing floats into the integer
# 'Salary' column with `*=` depends on an implicit dtype upcast that recent
# pandas versions deprecate. Rounding and casting back keeps the column int64.
is_over_30 = df['Age'] > 30
df.loc[is_over_30, 'Salary'] = (df.loc[is_over_30, 'Salary'] * 1.1).round().astype('int64')
print(df)

# Delete a column: drop returns a new frame and leaves df untouched
df_without_city = df.drop('City', axis=1)
print("df without city ", df_without_city)

# Delete a column from df itself by rebinding the name (no inplace=True needed)
df = df.drop('Experience', axis=1)
print("drop experience axis 1", df)

# Delete rows: with axis left at its default, drop removes rows by index label
df_without_first = df.drop(0)  # Drop first row
print(df_without_first)

# Delete rows by index labels
# (these labels only exist when 'Name' has been set as the index)
# df.drop(['John', 'Anna'], inplace=True)
# print(df)

# Reset index: builds a fresh 0..n-1 index and moves the previous index
# into a regular column (named 'index' here, since the old index had no name)
df_reset = df.reset_index()
print(df_reset)

    Name  Age      City  Salary  Experience
0   John   28  New York   50000           3
1   Anna   34     Paris   60000           8
2  Peter   29    Berlin   55000           5
3  Linda   42    London   70000          15
    Name  Age      City  Salary  Experience
0   John   29  New York   50000           3
1   Anna   34     Paris   60000           8
2  Peter   29    Berlin   55000           5
3  Linda   42    London   70000          15
    Name  Age      City  Salary  Experience
0   John   29  New York   50000           3
1   Anna   34     Paris   65000           8
2  Peter   29    Berlin   55000           5
3  Linda   42    London   70000          15
    Name  Age      City  Salary  Experience
0   John   29  New York   50000           3
1   Anna   34     Paris   71500           8
2  Peter   29    Berlin   55000           5
3  Linda   42    London   77000          15
df without city      Name  Age  Salary  Experience
0   John   29   50000           3
1   Anna   34   71500           8
2

In [13]:
# NOTE: this cell relies on `df` already being defined by a previously run cell.
# Every call below returns a new, sorted object; `df` itself is not reordered.

# Order rows by one column (ascending unless told otherwise)
df_sorted_age = df.sort_values(by='Age')
print(df_sorted_age)

# Order by several keys: City ascending, then Salary descending within a city
df_sorted_multiple = df.sort_values(
    by=['City', 'Salary'],
    ascending=[True, False],
)
print(df_sorted_multiple)

# Order rows by their index labels
df_sorted_index = df.sort_index()
print(df_sorted_index)

# A single column is a Series and can be sorted on its own
ages_sorted = df['Age'].sort_values()
print(ages_sorted)

    Name  Age      City  Salary
0   John   29  New York   50000
2  Peter   29    Berlin   55000
1   Anna   34     Paris   71500
3  Linda   42    London   77000
    Name  Age      City  Salary
2  Peter   29    Berlin   55000
3  Linda   42    London   77000
0   John   29  New York   50000
1   Anna   34     Paris   71500
    Name  Age      City  Salary
0   John   29  New York   50000
1   Anna   34     Paris   71500
2  Peter   29    Berlin   55000
3  Linda   42    London   77000
0    29
2    29
1    34
3    42
Name: Age, dtype: int64
