## pandas
- Used for Exploratory Data Analysis (EDA)
- Used for importing/exporting data, cleaning data, creating/deleting columns, etc.


In [39]:
# Make sure pandas is installed in the kernel's environment; if it is not, uncomment the next line.
# %pip install pandas

In [40]:
import warnings

import numpy as np
import pandas as pd
# Install a blanket "ignore" filter: from here on no warning of any category is shown.
warnings.simplefilter('ignore')

# print(pd.__version__)
# NOTE: this relative path is resolved against the working directory, so the file must exist there.
data_file = 'data.csv'

In [41]:
# (SKIP)
# Three ways to build a DataFrame by hand. `data` and `df` are rebound by each
# example, so only the last example's objects remain once the cell has finished.
# Random values come from a locally seeded generator, so the cell prints the same
# numbers on every run.
rng = np.random.default_rng(42)

# Create a DataFrame: Example 2 - hardcode the data using Series
data = {
    'A': pd.Series(pd.date_range("2023-01-01", periods=5, freq='D')),  # Time data: 5 consecutive days
    'B': pd.Series([120.5, 123.0, 121.3, 125.6, 124.2]),
    'C': pd.Series(['Buy', 'Sell', 'Hold', 'Buy', 'Sell']),
    'D': pd.Series(rng.integers(1, 11, size=5)),  # 5 random integers, 1 <= value <= 10
}
df = pd.DataFrame(data)

print(df)

############################
# (SKIP)
# Create a DataFrame: Example 3 - hardcode the data using plain lists
# np.nan marks the missing values (one City, one Salary).
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Helen', 'Helen', 'Helen', 'Jerry'],
    'Age': [25, 30, 35, 40, 22, 28, 32, 26, 26, 26, 23],
    'City': ['Chicago', 'Los Angeles', 'Chicago', 'Houston', 'Houston', np.nan, 'San Antonio', 'San Diego', 'San Diego', 'San Diego', 'Phoenix'],
    'Experience':  [2, 5, 7, 10, 1, 3, 6, 2, 2, 2, 6],
    'Experience2': [2, 5, 7, 10, 1, 3, 6, 2, 2, 2, 6],
    'Salary': [70000.0, 80000.0, np.nan, 90000.0, 48000.0, 72000.0, 85000.0, 62000.0, 62000.0, 62000.0, 78000.0]
}

df = pd.DataFrame(data)
print(df)

print("####################")
# Let's look at a single column: selecting one column by name returns a Series
series_age = df['Age']
print(type(series_age)) # <class 'pandas.core.series.Series'>
print(series_age)

####################
# (SKIP)
# You can also generate a DataFrame from a 2-D array plus explicit row/column labels
row_labels = ['bob', 'sam', 'hari']
column_headings = ['age', 'income']
# One random integer (10 <= value < 100) per (row, column) cell
data = rng.integers(10, 100, size=(len(row_labels), len(column_headings)))

df = pd.DataFrame(data, index=row_labels, columns=column_headings)
print(f"Generated DataFrame:\n{df}")

           A      B     C  D
0 2023-01-01  120.5   Buy  3
1 2023-01-02  123.0  Sell  9
2 2023-01-03  121.3  Hold  7
3 2023-01-04  125.6   Buy  9
4 2023-01-05  124.2  Sell  6
       Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.0
2   Charlie   35      Chicago           7            7      NaN
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28          NaN           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0
####################
<class 'pandas.core.series.Series'>
0

## DataFrame

### Export / import data to and from a file

In [42]:
# Build a small DataFrame from equal-length columns, then write it out to a file.
data = dict(
    name=['ash', 'timmy', 'jimmy', 'Samantha'],
    age=[30, 33, 61, 19],
    is_working=[True, False, True, True],
    income=[20.4, 20.1, 34.7, 55.9],
)
df = pd.DataFrame(data)

# Show the frame, then its Python type on the following line.
print(df, type(df), sep='\n')

########################

# Several export formats exist; only the CSV export below is active.
df.to_csv('junk.csv', sep=',', index=False)  # ',' is the default separator; index=False leaves the row labels out
# df.to_csv('junk.csv', sep=';', index=False)

# df.to_excel('junk.xlsx', index=False, sheet_name='my_sheet1')

# df.to_json('junk.json', orient='records', lines=True)

# df.to_pickle('junk.pkl')

#########################
# SKIP the following
# df.to_sql('my_table', conn, if_exists='replace', index=False)  # Requires SQLAlchemy or sqlite3

       name  age  is_working  income
0       ash   30        True    20.4
1     timmy   33       False    20.1
2     jimmy   61        True    34.7
3  Samantha   19        True    55.9
<class 'pandas.core.frame.DataFrame'>


In [43]:
# Import a comma-separated file into a DataFrame.
# Optional alternative: build the path from the current working directory
# from pathlib import Path
# data_file = Path.cwd() / 'data.csv'

data_file = 'data.csv'
df = pd.read_csv(data_file, sep=',')  # ',' is read_csv's default separator

print(df)

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80


In [44]:
# Loading other kinds of local files

# Delimited text file: fields are split on ';' because the values themselves contain ','
df = pd.read_csv('data_semi_colon.txt', delimiter=';')  # `delimiter` is an alias for `sep`
# df = pd.read_csv("data_semi-colon-california_house.csv", sep=';')
print(df)

##############
# Excel workbook
# df = pd.read_excel('data.xlsx', sheet_name='Sheet1')  # Requires openpyxl or xlrd
# print(df)

################
# JSON file
# df = pd.read_json('data.json',  lines=True)
# print(df)

######################
# Pickle file
# df = pd.read_pickle('data.pkl')
# print(df)

                 name  age place of birth
0           Ash,Kumar   23           Moon
1      Sameer, Khosla   32    HR, Haryana
2         Khan, Rahim   34    NY, NewYork
3  Shamlodhiya, Kumar   20    NY, NewYork


In [45]:
# SKIP
# You can also read data directly from the internet.
# Make sure it's LEGAL to do so.

# df = pd.read_csv('https://raw.githubusercontent.com/ash322ash422/tut_pandas_numpy/refs/heads/master/titanic.csv', sep=',')
# print(df.head(5))


# url = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population'
# tables = pd.read_html(url)  # returns a list of DataFrames
# print(f"Total tables: {len(tables)}")
# df = tables[2]
# print(df.head(5))


# url = 'https://en.wikipedia.org/wiki/Minnesota'
# tables = pd.read_html(url)  # returns a list of DataFrames
# print(f"Total tables: {len(tables)}")
# df = tables[2]
# print(df.head(5))


### Following skip for now############
# import sqlite3
# conn = sqlite3.connect('my_database.db')
# df = pd.read_sql('SELECT * FROM my_table', conn)


In [46]:
# Ways to look at the contents of a DataFrame
data_file = 'data.csv'
df = pd.read_csv(data_file)

print(df)            # the whole frame
print(df.head(n=3))  # first 3 rows
print(df.tail(n=3))  # last 3 rows

# Random sampling
print(df.sample(3))                   # 3 rows picked at random
print(df.sample(frac=0.2))            # a random 20% of the rows
print(df.sample(5, replace=True))     # 5 rows drawn with replacement, so the same row may appear more than once
print(df.sample(3, random_state=42))  # 3 random rows; the fixed random_state gives the same rows on every run

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
      Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0    Alice   25      Chicago    2     2  70000.0    

# STOP