# PANDAS: Topics Covered
### Creating / declaring dataframe
### Exporting / Importing data: csv, excel, json, pickle, database, etc
### Displaying data in different ways

In [26]:
# make sure you have pandas installed
!pip install pandas
# !pip install --upgrade pandas




[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [28]:
import pandas as pd # conventional alias; the rest of this notebook refers to pandas as `pd`
# Show which pandas version is installed
print(pd.__version__)

2.3.3


## Dataframe

### Export data to file 

In [31]:
# Build a small employee table from a dict of columns and display it.
# NOTE: when building a DataFrame from a dict, every column list must have the same length.
# None marks a missing value; it is displayed as NaN in the numeric columns.
data = dict(
    name=['Ash', 'Timmy', 'Jimmy', 'Samantha', 'Trybula'],
    age=[30, 33, 61, 19, None],
    is_working=[True, False, True, True, False],
    income=[20.4, 20.1, 34.7, 55.9, None],
)

df = pd.DataFrame(data=data)

# Show the table, then its Python type, one per line
print(df, type(df), sep='\n')

       name   age  is_working  income
0       Ash  30.0        True    20.4
1     Timmy  33.0       False    20.1
2     Jimmy  61.0        True    34.7
3  Samantha  19.0        True    55.9
4   Trybula   NaN       False     NaN
<class 'pandas.core.frame.DataFrame'>


In [35]:
# Write the same DataFrame out in several file formats.
# TIP: tab completion on `df.to_` lists every available exporter.

# CSV without the index column: first with the default ',' separator, then with ';'
df.to_csv(path_or_buf='employees.csv', index=False)
df.to_csv(path_or_buf='employees_semi_colon.csv', sep=';', index=False)

# JSON Lines: orient='records' with lines=True writes one JSON object per row
df.to_json(path_or_buf='employees.json', lines=True, orient='records')

# pickle: binary serialization of the DataFrame object
df.to_pickle(path='employees.pkl')

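#### Why pickle over CSV?
- Pickle can store any Python object, so a DataFrame's dtypes and index are preserved exactly
- Pickling and unpickling a DataFrame is generally significantly faster than writing and parsing CSV
- Caveat: pickle files are Python-specific and should only be loaded from sources you trust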

In [36]:
# Excel export is also available, but it needs the optional openpyxl package:
# %pip install openpyxl
df.to_excel(excel_writer='employees.xlsx', sheet_name='employees_sheet', index=False)

In [37]:
# Write the DataFrame to a table in a SQLite database file.
import sqlite3

conn = sqlite3.connect('XYZ-Corporation.db')
try:
    # if_exists='replace' recreates the table, so re-running this cell does not fail or append.
    # pandas accepts a plain sqlite3 connection here; other databases go through SQLAlchemy.
    msg = df.to_sql('employees', conn, if_exists='replace', index=False)
    print(msg)  # to_sql returns the number of rows written
finally:
    # Close the connection even if the write raised, so the database file is not left open
    conn.close()

5


In [38]:
# Read the CSV file back into a DataFrame (',' is the default separator).
# Missing values are displayed as NaN, which stands for "Not a Number".
df = pd.read_csv(filepath_or_buffer="employees.csv")
print(df)

       name   age  is_working  income
0       Ash  30.0        True    20.4
1     Timmy  33.0       False    20.1
2     Jimmy  61.0        True    34.7
3  Samantha  19.0        True    55.9
4   Trybula   NaN       False     NaN


### Import data

In [39]:
# Read a CSV file whose columns are separated by ';' instead of ','.
# `sep` has to match the separator that was used when the file was written.
df = pd.read_csv(filepath_or_buffer='employees_semi_colon.csv', sep=';')
print(df)

       name   age  is_working  income
0       Ash  30.0        True    20.4
1     Timmy  33.0       False    20.1
2     Jimmy  61.0        True    34.7
3  Samantha  19.0        True    55.9
4   Trybula   NaN       False     NaN


In [40]:
# Load one named sheet from the Excel workbook.
# Requires an Excel engine to be installed (openpyxl or xlrd).
df = pd.read_excel(io='employees.xlsx', sheet_name='employees_sheet')
print(df)

       name   age  is_working  income
0       Ash  30.0        True    20.4
1     Timmy  33.0       False    20.1
2     Jimmy  61.0        True    34.7
3  Samantha  19.0        True    55.9
4   Trybula   NaN       False     NaN


In [41]:
# Load the JSON file; lines=True tells pandas the file holds one JSON object per line.
df = pd.read_json(path_or_buf='employees.json', lines=True)
print(df)

       name   age  is_working  income
0       Ash  30.0        True    20.4
1     Timmy  33.0       False    20.1
2     Jimmy  61.0        True    34.7
3  Samantha  19.0        True    55.9
4   Trybula   NaN       False     NaN


In [42]:
# Restore the pickled DataFrame.
# Only unpickle files you created or trust: loading a pickle can execute arbitrary code.
df = pd.read_pickle(filepath_or_buffer='employees.pkl')
print(df)

       name   age  is_working  income
0       Ash  30.0        True    20.4
1     Timmy  33.0       False    20.1
2     Jimmy  61.0        True    34.7
3  Samantha  19.0        True    55.9
4   Trybula   NaN       False     NaN


In [43]:
# Read the employees table back from the SQLite database file.
import sqlite3

conn = sqlite3.connect('XYZ-Corporation.db')
try:
    df = pd.read_sql('SELECT * FROM employees', conn)
    # NOTE: is_working comes back as 1/0 instead of True/False after the database round trip
    print(df)
finally:
    # Close the connection even if the query raised, so the database file is not left open
    conn.close()

       name   age  is_working  income
0       Ash  30.0           1    20.4
1     Timmy  33.0           0    20.1
2     Jimmy  61.0           1    34.7
3  Samantha  19.0           1    55.9
4   Trybula   NaN           0     NaN


In [44]:
# pandas can also read straight from a URL -- make sure you are legally allowed to use the data.

titanic_csv_url = (
    'https://raw.githubusercontent.com/ash322ash422/tut_pandas_numpy/'
    'refs/heads/master/tut_pandas/data_titanic.csv'
)
df = pd.read_csv(titanic_csv_url, sep=',')
print(df.head(5))  # first 5 rows only


# Tables embedded in a web page can be read with read_html (left disabled here):
# url = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population'
# tables = pd.read_html(url)  # returns a list of DataFrames
# print(f"Total tables: {len(tables)}")
# df = tables[2]
# print(df.head(5))


# url = 'https://en.wikipedia.org/wiki/Minnesota'
# tables = pd.read_html(url)  # returns a list of DataFrames
# print(f"Total tables: {len(tables)}")
# df = tables[2]
# print(df.head(5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


## Display dataframe

In [47]:
# Different ways to look at a DataFrame.

# Make the DataFrame a little larger first: stack three copies with concat().
# ignore_index=True renumbers the rows 0..n-1 instead of repeating 0..4.
df = pd.read_csv("employees.csv")  # default sep=','
df = pd.concat([df] * 3, ignore_index=True)

# print(df)          # the whole table
# print(df.head(3))  # first 3 rows
print(df.tail(3))    # last 3 rows

        name   age  is_working  income
12     Jimmy  61.0        True    34.7
13  Samantha  19.0        True    55.9
14   Trybula   NaN       False     NaN


In [54]:
# Other sampling variants (uncomment to try):
# print(df.sample(n=3))                  # 3 random rows; each run outputs a different result
# print(df.sample(frac=0.2))             # 20% of the rows; each run outputs a different result
# print(df.sample(n=3, random_state=42)) # 3 random rows; fixed seed, so each run outputs the same result

# Sample with replacement, so the same row can appear more than once
rows_with_repeats = df.sample(n=5, replace=True)
print(rows_with_repeats)

        name   age  is_working  income
12     Jimmy  61.0        True    34.7
13  Samantha  19.0        True    55.9
0        Ash  30.0        True    20.4
6      Timmy  33.0       False    20.1
6      Timmy  33.0       False    20.1


# STOP