## Titanic

In [1]:
!pip install psycopg2-binary



In [0]:
import os
from collections import Counter
from getpass import getpass

import pandas as pd
import psycopg2

In [3]:
#load the titanic data into df:
url = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-3-Sprint-2-SQL-and-Databases/master/module2-sql-for-analysis/titanic.csv'
df = pd.read_csv(url)

#format age feature as integer
#NOTE(review): if the CSV stores fractional ages, astype('int') truncates them - confirm this is intended
df.Age = df.Age.astype('int')

#remove double quotes, apostrophes and commas from the names column.
#regex=True is required here: the pattern is a character class, and
#pandas >= 2.0 treats the pattern as a literal string unless told otherwise
df['Name'] = df['Name'].str.replace(r"[\"\',]", '', regex=True)

df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05


In [4]:
#check column dtypes and non-null counts after the cleaning step above
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8 columns):
Survived                   887 non-null int64
Pclass                     887 non-null int64
Name                       887 non-null object
Sex                        887 non-null object
Age                        887 non-null int64
Siblings/Spouses Aboard    887 non-null int64
Parents/Children Aboard    887 non-null int64
Fare                       887 non-null float64
dtypes: float64(1), int64(5), object(2)
memory usage: 55.5+ KB


In [0]:
#function to format the dataframe data into tuples for each row to insert
#into elephantsql

def gen_row_tuples(df):
  """Turn every row of a dataframe into a flat tuple.

  Args:
    df: the pandas DataFrame to convert.

  Returns:
    A list with one tuple per row, in row order. Each tuple starts with the
    row's index label and continues with the row's values in column order.
  """
  return [(label, *record.values) for label, record in df.iterrows()]

In [6]:
#build the list of per-row tuples (index label first, then the column values)
titanic_row_tuples = gen_row_tuples(df)

#inspect the first row to make sure it's formatted correctly:
titanic_row_tuples[0]

(0, 0, 3, 'Mr. Owen Harris Braund', 'male', 22, 1, 0, 7.25)

In [0]:
#Connect to elephantSQL instance:
#connection settings come from environment variables so that no secret is
#stored in the notebook; the password is prompted for when it is not set

dbname = os.environ.get('ELEPHANTSQL_DBNAME', 'mhodwfds')
user = os.environ.get('ELEPHANTSQL_USER', 'mhodwfds')
password = os.environ.get('ELEPHANTSQL_PASSWORD') or getpass('ElephantSQL password: ')
host = os.environ.get('ELEPHANTSQL_HOST', 'raja.db.elephantsql.com')

pg_conn = psycopg2.connect(dbname=dbname, user=user,
                           password=password, host=host)

#cursor used by the following cells to run statements on this connection
pg_curs = pg_conn.cursor()

In [0]:
#DDL for the titanic table: one column per dataframe column, preceded by
#the row index as primary key
create_titanic_table = """
CREATE TABLE titanic (
  index SERIAL PRIMARY KEY,
  survived INT,
  pclass INT,
  name VARCHAR(85),
  sex VARCHAR(10),
  age INT,
  sibs_spouse INT,
  par_child INT,
  fare REAL
)
"""

#start from a clean slate (drop any previous titanic table), then
#create the table in the postgresql instance
pg_curs.execute("DROP TABLE IF EXISTS titanic")
pg_curs.execute(create_titanic_table)

In [0]:
#insert every row tuple into the titanic table.
#The values are passed as query parameters instead of being pasted into the
#SQL text, so quoting/escaping is handled by the driver and names containing
#quotes or commas cannot break (or inject into) the statement.
insert_titanic_row = "INSERT INTO titanic VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"

#psycopg2 has no adapter registered for numpy scalars by default, so any value
#exposing .item() is converted to the equivalent built-in Python type first
native_rows = [
  tuple(val.item() if hasattr(val, 'item') else val for val in row)
  for row in titanic_row_tuples
]

pg_curs.executemany(insert_titanic_row, native_rows)

#make the inserts permanent
pg_conn.commit()

In [10]:
#re-establish a cursor object:
pg_curs = pg_conn.cursor()

#query all the data from the titanic table.
#ORDER BY is needed because SQL gives no row-order guarantee without it, and
#the slice below is meant to show the first rows of the dataset
pg_curs.execute('SELECT * FROM titanic ORDER BY index')

#assign the results to variable
pg_titanic = pg_curs.fetchall()

#inspect the first 5 rows of the table:
pg_titanic[:5]

[(0, 0, 3, 'Mr. Owen Harris Braund', 'male', 22, 1, 0, 7.25),
 (1,
  1,
  1,
  'Mrs. John Bradley (Florence Briggs Thayer) Cumings',
  'female',
  38,
  1,
  0,
  71.2833),
 (2, 1, 3, 'Miss. Laina Heikkinen', 'female', 26, 0, 0, 7.925),
 (3,
  1,
  1,
  'Mrs. Jacques Heath (Lily May Peel) Futrelle',
  'female',
  35,
  1,
  0,
  53.1),
 (4, 0, 3, 'Mr. William Henry Allen', 'male', 35, 0, 0, 8.05)]

## Assignment Queries:


- How many passengers survived, and how many died?
- How many passengers were in each class?
- How many passengers survived/died within each class?
- What was the average age of survivors vs nonsurvivors?
- What was the average age of each passenger class?
- What was the average fare by passenger class? By survival?
- How many siblings/spouses aboard on average, by passenger class? By survival?
- How many parents/children aboard on average, by passenger class? By survival?
- Do any passengers have the same name?
- (Bonus! Hard, may require pulling and processing with Python) How many married couples were aboard the Titanic? Assume that two people (one Mr. and one Mrs.) with the same last name and with at least 1 sibling/spouse aboard are a married couple.


In [11]:
#create a list containing all the questions:
questions = ['Q1: How many passengers survived, and how many died?\n(survived / count)\n',
             'Q2: How many passengers were in each class?\n(pclass / count)\n',
             'Q3: How many passengers survived/died within each class?\n(pclass / survived / count)\n',
             'Q4: What was the average age of survivors vs nonsurvivors?\n(survived / avg age)\n',
             'Q5: What was the average age of each passenger class?\n(pclass / avg age)\n',
             'Q6: What was the average fare by passenger class?\n(pclass / avg fare)\n',
             'Q7: What was the average fare by survival?\n(survived / avg fare)\n',
             'Q8: How many siblings/spouses aboard on average, by passenger class?\n(pclass / avg siblings & spouses)\n',
             'Q9: How many siblings/spouses aboard on average by survival?\n(survived / avg siblings & spouses)\n',
             'Q10: How many parents/children aboard on average, by passenger class?\n(pclass / avg parents & children)\n',
             'Q11: How many parents/children aboard on average by survival?\n(survived / avg parents & children)\n',
             'Q12: Do any passengers have the same name?']

#create a list of corresponding SQL queries for each question.
#Every grouped query carries an ORDER BY so the printed rows come back in a
#stable order (GROUP BY alone does not guarantee one):
queries = ["SELECT survived, COUNT(*) FROM titanic GROUP BY survived ORDER BY survived;",
           "SELECT pclass, COUNT(*) FROM titanic GROUP BY pclass ORDER BY pclass;",
           "SELECT pclass, survived, COUNT(*) FROM titanic GROUP BY pclass, survived ORDER BY pclass, survived;",
           "SELECT survived, ROUND(AVG(age), 4) FROM titanic GROUP BY survived ORDER BY survived;",
           "SELECT pclass, ROUND(AVG(age), 4) FROM titanic GROUP BY pclass ORDER BY pclass;",
           "SELECT pclass, AVG(fare) FROM titanic GROUP BY pclass ORDER BY pclass;",
           "SELECT survived, AVG(fare) FROM titanic GROUP BY survived ORDER BY survived;",
           "SELECT pclass, ROUND(AVG(sibs_spouse), 4) FROM titanic GROUP BY pclass ORDER BY pclass;",
           "SELECT survived, ROUND(AVG(sibs_spouse), 4) FROM titanic GROUP BY survived ORDER BY survived;",
           "SELECT pclass, ROUND(AVG(par_child), 4) FROM titanic GROUP BY pclass ORDER BY pclass;",
           "SELECT survived, ROUND(AVG(par_child), 4) FROM titanic GROUP BY survived ORDER BY survived;",
           "SELECT name, COUNT(name) FROM titanic GROUP BY name HAVING COUNT(name) > 1 ORDER BY name;"]

#the two lists are paired by position, so they must stay the same length
assert len(questions) == len(queries), 'each question needs exactly one query'

#print each question, execute the corresponding query, print the result:
for question, sql in zip(questions, queries):
  print(question)
  pg_curs.execute(sql)
  print(pg_curs.fetchall())
  print('\n')

Q1: How many passengers survived, and how many died?
(survived / count)

[(0, 545), (1, 342)]


Q2: How many passengers were in each class?
(pclass / count)

[(1, 216), (2, 184), (3, 487)]


Q3: How many passengers survived/died within each class?
(pclass / survived / count)

[(1, 0, 80), (1, 1, 136), (2, 0, 97), (2, 1, 87), (3, 0, 368), (3, 1, 119)]


Q4: What was the average age of survivors vs nonsurvivors?
(survived / avg age)

[(0, Decimal('30.1229')), (1, Decimal('28.3918'))]


Q5: What was the average age of each passenger class?
(pclass / avg age)

[(1, Decimal('38.7824')), (2, Decimal('29.8478')), (3, Decimal('25.1704'))]


Q6: What was the average fare by passenger class?
(pclass / avg fare)

[(1, 84.154687528257), (2, 20.6621831810993), (3, 13.7077075010452)]


Q7: What was the average fare by survival?
(survived / avg fare)

[(0, 22.2085840951412), (1, 48.3954076976107)]


Q8: How many siblings/spouses aboard on average, by passenger class?
(pclass / avg siblings & spouses)

### Bonus Question!

(Hard, may require pulling and processing with Python.) How many married couples were aboard the Titanic? Assume that two people (one Mr. and one Mrs.) with the same last name and with at least 1 sibling/spouse aboard are a married couple.


In [12]:
#SQL query to grab the names of passengers w/ sibs_spouse >= 1
#(the "at least 1 sibling/spouse aboard" condition of the bonus question)

query = "SELECT name FROM titanic where sibs_spouse >= 1"
pg_curs.execute(query)

#assign to variable called names (one result row per matching passenger)
names = pg_curs.fetchall()

#function to count married couples as a matching instance of Mr. lastname and Mrs. lastname
def count_married_couples(names):
  """Count Mr./Mrs. pairs that share a surname.

  The salutation is the first whitespace-separated token of a name and the
  surname is the last one. Names with any other salutation are ignored.
  Each passenger is paired at most once, so a surname carried by m "Mr."
  and w "Mrs." passengers contributes min(m, w) couples.

  Args:
    names: iterable of rows whose first element is the full name string
      (e.g. the rows returned by cursor.fetchall() for a one-column query).

  Returns:
    The number of matched couples as an int.
  """
  mr_surnames = Counter()
  mrs_surnames = Counter()

  for row in names:
    #split() without an argument ignores leading/trailing/repeated spaces,
    #so a stray space cannot turn the surname into an empty string
    tokens = row[0].split() if row[0] else []
    if not tokens:
      #empty or missing name: nothing to classify
      continue
    salutation, surname = tokens[0], tokens[-1]
    if salutation == 'Mr.':
      mr_surnames[surname] += 1
    elif salutation == 'Mrs.':
      mrs_surnames[surname] += 1

  #Counter intersection keeps, per surname, the smaller of the two counts,
  #which is exactly the number of one-to-one Mr./Mrs. pairings
  return sum((mr_surnames & mrs_surnames).values())

#call the function on the name rows fetched by the SQL query above;
#as the cell's last expression, the resulting couple count is displayed
count_married_couples(names)

48