In [1]:
import pandas as pd

In [2]:
# Pandas Series: a one-dimensional, labelled array
a = [1, 7, 2]
myvar = pd.Series(a, index=['x', 'y', 'z'])  # `index` replaces the default 0..n-1 labels
print(myvar)
# Positional access goes through .iloc. A bare myvar[1] on a string-labelled
# Series relies on a deprecated positional fallback of Series.__getitem__.
print(myvar.iloc[1])
print(myvar['y'])  # label-based access

# For dictionary input, the keys become the labels
calories = {"mon": 100, "tues": 200, "wed": 300}
myvar = pd.Series(calories)
print(myvar)
# Passing `index` together with a dict keeps only the listed keys, in that order
myvar2 = pd.Series(calories, index=['mon', 'tues'])
print(myvar2)

x    1
y    7
z    2
dtype: int64
7
7
mon     100
tues    200
wed     300
dtype: int64
mon     100
tues    200
dtype: int64


In [3]:
# Pandas DataFrame built from a Python dictionary: a 2-D table in which each
# key is a column name and each list holds that column's values.
cars = ["Audi", "Cadillac", "Mercedes"]
top_speeds = [100, 200, 300]
mydataset = {'car': cars, 'high speed': top_speeds}

mydata = pd.DataFrame(mydataset)
print(mydata)

# .loc with a single row label gives that row as a Series (column name -> value)
first_row = mydata.loc[0]
print(first_row)

# .loc with a list of row labels gives a DataFrame containing those rows
first_two_rows = mydata.loc[[0, 1]]
print(first_two_rows)

# Same data again, this time with custom row labels instead of 0, 1, 2
row_labels = ['first', 'second', 'third']
mydata = pd.DataFrame(mydataset, index=row_labels)
print(mydata)
selected_rows = mydata.loc[['second', 'third']]  # pick rows by their custom labels
print(selected_rows)

        car  high speed
0      Audi         100
1  Cadillac         200
2  Mercedes         300
car           Audi
high speed     100
Name: 0, dtype: object
        car  high speed
0      Audi         100
1  Cadillac         200
             car  high speed
first       Audi         100
second  Cadillac         200
third   Mercedes         300
             car  high speed
second  Cadillac         200
third   Mercedes         300


In [4]:
# Load a .csv file into a DataFrame
df_csv = pd.read_csv('exampledata4pandas.csv')

# Default representation of the frame
print(df_csv)

# to_string() renders every row of the frame as text
whole_table = df_csv.to_string()
print(whole_table)

# display.max_rows is the row count above which print(df) abbreviates the frame
# to its first and last few rows.
row_limit = pd.options.display.max_rows
print(row_limit)
# The limit can be changed with: pd.options.display.max_rows = any_number

FileNotFoundError: [Errno 2] No such file or directory: 'exampledata.csv'

In [None]:
# Load a JSON file into a DataFrame. JSON is plain text laid out like nested
# dictionaries and is often used for large data sets.
data_json = pd.read_json('examplejson.json')
json_table_text = data_json.to_string()  # full textual rendering of the frame
print(json_table_text)

In [None]:
# Example: a nested Python dictionary turned into a DataFrame.
# Outer keys are the column names; each inner dictionary maps a row label
# (the strings "0".."5") to that row's value in the column.
row_labels = [str(position) for position in range(6)]
column_values = {
    "Duration": [60, 60, 60, 45, 45, 60],
    "Pulse": [110, 117, 103, 109, 117, 102],
    "Maxpulse": [130, 145, 135, 175, 148, 127],
    "Calories": [409, 479, 340, 282, 406, 300],
}
data = {
    column: dict(zip(row_labels, values))
    for column, values in column_values.items()
}
df1 = pd.DataFrame(data)
print(df1)

# Overview of the data: the first 10 rows
print(df_csv.head(10))

# The last 10 rows, printed together with the column header
print(df_csv.tail(10))

# Summary of the DataFrame; the non-null counts also show which columns
# contain empty values.
# info() writes its report to stdout itself and returns None, so it is called
# directly: wrapping it in print() would add a stray "None" line.
df_csv.info()


In [None]:
# Data Cleaning - removing bad data (empty cells, wrong format, wrong data, duplicates, etc.)

# a) Removing empty values
tbcleaned_dataframe = pd.read_csv('dataset_to_clean.csv')
# dropna() returns a NEW DataFrame without the rows that had empty cells; the
# original is left unchanged. (dropna(inplace=True) would instead modify the
# original frame itself and return None.)
dataframe2 = tbcleaned_dataframe.dropna()
print(dataframe2)

dataframe21 = tbcleaned_dataframe.fillna(1)  # fills every empty cell with the given value
print(dataframe21)

dataframe22 = tbcleaned_dataframe["Calories"].fillna(1)  # fills one column only; the result is a Series
print(dataframe22)

# .mean(), .median() or .mode()[0] give a data-driven value to pass to .fillna().
# The mean is taken from the original column (mean() skips empty cells) rather
# than from dataframe2, which has already lost every row with any empty cell.
x = tbcleaned_dataframe["Calories"].mean()

# b) Wrong format - convert to the right format (e.g. pd.to_datetime()) and remove rows with empty values

# c) Wrong data
df_csv.loc[7, 'Duration'] = 45  # an error we spotted ourselves can simply be corrected by hand
print(df_csv.to_string())

# For large datasets, express the rule as a boolean mask instead of looping row by row.
too_long = df_csv["Duration"] > 120

# Alternative 1: remove the offending rows. This is built before the cap below;
# once the values are capped, no row exceeds 120 and nothing would be removed.
df_csv_without_long = df_csv.drop(df_csv.index[too_long])

# Alternative 2: keep the rows and cap the value. This is what df_csv carries forward.
df_csv.loc[too_long, "Duration"] = 120

# d) Removing duplicates
print(df_csv.duplicated().to_string())  # True for every row that repeats an earlier row
# drop_duplicates() returns a new frame without the repeated rows; rebinding
# df_csv avoids inplace=True, which returns None.
df_csv = df_csv.drop_duplicates()

In [None]:
# Correlations
# corr() returns the pairwise correlation between columns: an absolute value
# close to 1 means a strong relationship, a value close to 0 means little or none.
# numeric_only=True skips non-numeric columns, which otherwise make corr() raise
# in pandas >= 2.0. NOTE(review): df_csv's column types are not visible here;
# with all-numeric columns the result is unchanged.
print(df_csv.corr(numeric_only=True))