In [2]:
import os # Accessing directory structure
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #Plotting

In [None]:
print(os.listdir('../'))

# looking at the data
for line in open('../DataSet/train.tsv', 'r').readlines():
    print(line.strip('\n'))

In [20]:
def read_dataframe(tsv_file: str) -> pd.DataFrame:
    # creates a "dataframe" or "df" for short. This is similar to a 2-D python dict.
    df = pd.read_csv(tsv_file, delimiter='\t', dtype=object)
    
    # replaces all "null" or "NaN" values with an empty string
    df.fillna("", inplace=True)
    
    # labels the columns in the dataset using the data dictionary described in the README
    df.columns = [
        'id',                # Column 1: the ID of the statement ([ID].json).
        'label',             # Column 2: the label.
        'statement',         # Column 3: the statement.
        'subjects',          # Column 4: the subject(s).
        'speaker',           # Column 5: the speaker.
        'speaker_job_title', # Column 6: the speaker's job title.
        'state_info',        # Column 7: the state info.
        'party_affiliation', # Column 8: the party affiliation.
        
        # Column 9-13: the total credit history count, including the current statement.
        'count_1', # barely true counts.
        'count_2', # false counts.
        'count_3', # half true counts.
        'count_4', # mostly true counts.
        'count_5', # pants on fire counts.
        
        'context' # Column 14: the context (venue / location of the speech or statement).
    ]
    return df

df = read_dataframe('../DataSet/train.tsv')

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10239 entries, 0 to 10238
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 10239 non-null  object
 1   label              10239 non-null  object
 2   statement          10239 non-null  object
 3   subjects           10239 non-null  object
 4   speaker            10239 non-null  object
 5   speaker_job_title  10239 non-null  object
 6   state_info         10239 non-null  object
 7   party_affiliation  10239 non-null  object
 8   count_1            10239 non-null  object
 9   count_2            10239 non-null  object
 10  count_3            10239 non-null  object
 11  count_4            10239 non-null  object
 12  count_5            10239 non-null  object
 13  context            10239 non-null  object
dtypes: object(14)
memory usage: 1.1+ MB


In [23]:
df.head(10)

Unnamed: 0,id,label,statement,subjects,speaker,speaker_job_title,state_info,party_affiliation,count_1,count_2,count_3,count_4,count_5,context
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1,1,0,a floor speech.
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70,71,160,163,9,Denver
2,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7,19,3,5,44,a news release
3,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15,9,20,19,2,an interview on CNN
4,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0,3,2,5,1,a an online opinion-piece
5,2342.json,barely-true,Jim Dunnam has not lived in the district he re...,candidates-biography,republican-party-texas,,Texas,republican,3,1,1,3,1,a press release.
6,153.json,half-true,I'm the only person on this stage who has work...,ethics,barack-obama,President,Illinois,democrat,70,71,160,163,9,"a Democratic debate in Philadelphia, Pa."
7,5602.json,half-true,"However, it took $19.5 million in Oregon Lotte...",jobs,oregon-lottery,,,organization,0,0,1,0,1,a website
8,9741.json,mostly-true,Says GOP primary opponents Glenn Grothman and ...,"energy,message-machine-2014,voting-record",duey-stroebel,State representative,Wisconsin,republican,0,0,0,1,0,an online video
9,7115.json,mostly-true,"For the first time in history, the share of th...",elections,robert-menendez,U.S. Senator,New Jersey,democrat,1,3,1,3,0,a speech


In [24]:
df.loc[df['subjects'] == 'economy,jobs']

Unnamed: 0,id,label,statement,subjects,speaker,speaker_job_title,state_info,party_affiliation,count_1,count_2,count_3,count_4,count_5,context
3,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15,9,20,19,2,an interview on CNN
113,10423.json,true,The economy is creating jobs at the fastest pa...,"economy,jobs",barack-obama,President,Illinois,democrat,70,71,160,163,9,the State of the Union address
132,7803.json,barely-true,We are already almost halfway to our 2010 goal...,"economy,jobs",rick-scott,Governor,Florida,republican,28,23,38,34,7,2013-14 veto message
192,9289.json,half-true,Every single one of the 41 Republican senators...,"economy,jobs",facebook-posts,Social media posting,,none,14,18,15,11,36,a meme shared on social media
202,6265.json,barely-true,The state of Ohio has one of the lowest unempl...,"economy,jobs",bill-johnson,U.S. representative,Ohio,republican,2,1,0,0,0,a telephone town hall meeting
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9894,5686.json,true,We were the No. 1 job creator in America in Fe...,"economy,jobs",john-kasich,"Governor of Ohio as of Jan. 10, 2011",Ohio,republican,9,8,10,18,3,an interview with Fox News
9922,6672.json,mostly-true,Were seeing now 30 straight months of private-...,"economy,jobs",julian-castro,"Mayor, San Antonio",Texas,democrat,0,0,3,3,0,"an interview on ""Meet the Press."""
9985,9715.json,true,Its been 17 years that weve had unemployment h...,"economy,jobs",dennis-richardson,state representative,Oregon,republican,0,4,1,2,0,a campaign debate
10172,6653.json,half-true,"Weve got 7.2 percent unemployment (in Ohio), b...","economy,jobs",rob-portman,U.S. senator from Ohio,Ohio,republican,3,5,11,11,1,a television interview
