# Creating a SQLite Database from a csv File

## Imports

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("academy_awards.csv", encoding="ISO-8859-1")

In [4]:
df["Year"] = df["Year"].str[:4].astype("int64")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10137 entries, 0 to 10136
Data columns (total 11 columns):
Year               10137 non-null int64
Category           10137 non-null object
Nominee            10137 non-null object
Additional Info    9011 non-null object
Won?               10137 non-null object
Unnamed: 5         11 non-null object
Unnamed: 6         12 non-null object
Unnamed: 7         3 non-null object
Unnamed: 8         2 non-null object
Unnamed: 9         1 non-null object
Unnamed: 10        1 non-null object
dtypes: int64(1), object(10)
memory usage: 871.2+ KB


In [6]:
later_than_2000 = df[df["Year"] > 2000]

In [7]:
award_categories = [
    "Actor -- Leading Role",
    "Actor -- Supporting Role",
    "Actress -- Leading Role",
    "Actress -- Supporting Role"
]

In [8]:
nominations = later_than_2000[
    later_than_2000["Category"].isin(award_categories)]

In [12]:
nominations["Won"] = nominations["Won?"].map({
    "YES": 1,
    "NO": 0
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [15]:
drop_cols = ["Unnamed: {}".format(i) for i in [5,6,7,8,9,10]]
drop_cols.append("Won?")
drop_cols

['Unnamed: 5',
 'Unnamed: 6',
 'Unnamed: 7',
 'Unnamed: 8',
 'Unnamed: 9',
 'Unnamed: 10',
 'Won?']

In [17]:
final_nominations = nominations.drop(drop_cols, axis=1)

In [22]:
final_nominations["Additional Info"] = final_nominations["Additional Info"].str.rstrip("'}")

In [26]:
final_nominations["additional_info_list"] = final_nominations["Additional Info"].str.split(" {'") 

In [29]:
final_nominations["Movie"]  = final_nominations["additional_info_list"].str[0]

In [31]:
final_nominations["Character"] = final_nominations["additional_info_list"].str[1]

In [32]:
final_nominations.head()

Unnamed: 0,Year,Category,Nominee,Additional Info,Won,additional_info_list,Movie,Character
0,2010,Actor -- Leading Role,Javier Bardem,Biutiful {'Uxbal,0,"[Biutiful, Uxbal]",Biutiful,Uxbal
1,2010,Actor -- Leading Role,Jeff Bridges,True Grit {'Rooster Cogburn,0,"[True Grit, Rooster Cogburn]",True Grit,Rooster Cogburn
2,2010,Actor -- Leading Role,Jesse Eisenberg,The Social Network {'Mark Zuckerberg,0,"[The Social Network, Mark Zuckerberg]",The Social Network,Mark Zuckerberg
3,2010,Actor -- Leading Role,Colin Firth,The King's Speech {'King George VI,1,"[The King's Speech, King George VI]",The King's Speech,King George VI
4,2010,Actor -- Leading Role,James Franco,127 Hours {'Aron Ralston,0,"[127 Hours, Aron Ralston]",127 Hours,Aron Ralston


In [34]:
final_nominations.drop(["Additional Info", "additional_info_list"], 
                       inplace=True, axis=1)

In [35]:
final_nominations.head()

Unnamed: 0,Year,Category,Nominee,Won,Movie,Character
0,2010,Actor -- Leading Role,Javier Bardem,0,Biutiful,Uxbal
1,2010,Actor -- Leading Role,Jeff Bridges,0,True Grit,Rooster Cogburn
2,2010,Actor -- Leading Role,Jesse Eisenberg,0,The Social Network,Mark Zuckerberg
3,2010,Actor -- Leading Role,Colin Firth,1,The King's Speech,King George VI
4,2010,Actor -- Leading Role,James Franco,0,127 Hours,Aron Ralston


Clean Data! Now we move on to setting up the database.

In [36]:
import sqlite3

In [37]:
conn = sqlite3.connect("nominations.db")

In [39]:
final_nominations.to_sql("nominations", conn,index=False)

In [45]:
curr = conn.cursor()

In [46]:
curr.execute("PRAGMA table_info")

<sqlite3.Cursor at 0x7fb7601d7180>

In [48]:
curr.execute("select * from nominations limit 10")

<sqlite3.Cursor at 0x7fb7601d7180>

In [49]:
curr.fetchall()

[(2010, 'Actor -- Leading Role', 'Javier Bardem', 0, 'Biutiful', 'Uxbal'),
 (2010,
  'Actor -- Leading Role',
  'Jeff Bridges',
  0,
  'True Grit',
  'Rooster Cogburn'),
 (2010,
  'Actor -- Leading Role',
  'Jesse Eisenberg',
  0,
  'The Social Network',
  'Mark Zuckerberg'),
 (2010,
  'Actor -- Leading Role',
  'Colin Firth',
  1,
  "The King's Speech",
  'King George VI'),
 (2010,
  'Actor -- Leading Role',
  'James Franco',
  0,
  '127 Hours',
  'Aron Ralston'),
 (2010,
  'Actor -- Supporting Role',
  'Christian Bale',
  1,
  'The Fighter',
  'Dicky Eklund'),
 (2010,
  'Actor -- Supporting Role',
  'John Hawkes',
  0,
  "Winter's Bone",
  'Teardrop'),
 (2010,
  'Actor -- Supporting Role',
  'Jeremy Renner',
  0,
  'The Town',
  'James Coughlin'),
 (2010,
  'Actor -- Supporting Role',
  'Mark Ruffalo',
  0,
  'The Kids Are All Right',
  'Paul'),
 (2010,
  'Actor -- Supporting Role',
  'Geoffrey Rush',
  0,
  "The King's Speech",
  'Lionel Logue')]

In [50]:
conn.close()