In [33]:
import re 
from unidecode import unidecode

def fix_quote_type(sent):    
    CLEANR = re.compile('<.*?>')
    def cleanhtml(raw_html):
        cleantext = re.sub(CLEANR, '', raw_html)
        return cleantext
    
    def normalize(text):
        text = '' if pd.isnull(text) else text
        text = re.sub('\s+', ' ', text)
        return cleanhtml(unidecode(text).strip())
    
    quote_type_mapper = {
        '': 'No Quote',
        'BACKGROUND': 'Background/Narrative',
        'NARRATIVE': 'Background/Narrative',
        'PUBLIC SPEECH, NOT TO JOURNO': 'Statement/Public Speech',
        'STATEMENT': 'Statement/Public Speech',
        'COMMUNICATION, NOT TO JOURNO': 'Email/Social Media Post',
        'LAWSUIT': 'Court Proceeding',
        'TWEET': 'Email/Social Media Post',
        'SOCIAL MEDIA POST': 'Email/Social Media Post',        
        'PROPOSAL': 'Proposal/Order/Law',
        'Other: LAWSUIT': 'Court Proceeding',
        'Other: Evaluation': 'Quote',
        'Other: DIRECT OBSERVATION': 'Direct Observation',
        'Other: Campaign filing': 'Published Work/Press Report',
        'Other: VOTE/POLL': 'Vote/Poll',
        'Other: PROPOSAL': 'Proposal/Order/Law',
        'Other: Campaign Filing': 'Published Work/Press Report',
        'Other: Data analysis': 'Direct Observation',
        'Other: Analysis': 'Direct Observation',
        'Other: LAW': 'Proposal/Order/Law',
        'Other: Investigation': 'Direct Observation',
        'Other: Database': 'Published Work/Press Report',
        'Other: Data Analysis': 'Direct Observation',
        'DOCUMENT': 'Published Work/Press Report',
        'PRESS REPORT': 'Published Work/Press Report',
        'PUBLISHED WORK': 'Published Work/Press Report',        
    }
    other_cats = {
        'Other: Cannot Determine': 'Other',
        'Quote': 'Other', 
        'VOTE/POLL': 'Other',
        'Vote/Poll':'Other',
        'Declined Comment': 'Other',
        'PRICE SIGNAL': 'Other',
        'DECLINED COMMENT': 'Other'
    }

    q = sent.get('quote_type', '')
    q = quote_type_mapper.get(q, q)
    q = other_cats.get(q, q )
    if (q == 'QUOTE'):
        if ('"' in normalize(sent['sent'])):
            return 'Indirect Quote'
        else:
            return 'Direct Quote'
    return q.title() if pd.notnull(q) else q

In [5]:
import jsonlines
import pandas as pd 

In [3]:
annotated_data = list(jsonlines.open('../tasks/data_split_annotated_sources.jsonl'))
annotated_data = list(map(lambda x: x['data'], annotated_data))

In [7]:
annotation_df = pd.concat(list(map(pd.DataFrame, annotated_data)))

In [9]:
annotation_df['quote_type'].value_counts()

                                11807
QUOTE                            7246
BACKGROUND                       1358
STATEMENT                        1253
PUBLISHED WORK                    698
NARRATIVE                         551
PRESS REPORT                      540
PROPOSAL/ORDER/LAW                410
PUBLIC SPEECH, NOT TO JOURNO      387
COMMUNICATION, NOT TO JOURNO      379
LAWSUIT                           253
SOCIAL MEDIA POST                 232
DIRECT OBSERVATION                137
DECLINED COMMENT                  122
VOTE/POLL                         120
DOCUMENT                          115
TWEET                              65
PRICE SIGNAL                       47
PROPOSAL                           32
Other: LAWSUIT                     17
Other: Evaluation                   9
Other: DIRECT OBSERVATION           8
Other: Campaign filing              7
Other: VOTE/POLL                    6
Other: PROPOSAL                     5
Other: Campaign Filing              4
Other: Data 

In [14]:
(annotation_df
 .loc[lambda df: df['quote_type'] == 'PRICE SIGNAL']
 ['sent']
 .loc[29]
)

'Tourist visits have declined , and the Hong Kong stock market has been falling for the past several weeks .  '

In [16]:
(annotation_df
 .loc[lambda df: df['quote_type'] == 'LAWSUIT']
 ['sent']
 .loc[2].iloc[0]
)

'In quick succession , Mr. Trump was handed defeats in Pennsylvania , Arizona and Michigan , where a state judge in Detroit rejected an unusual Republican attempt to halt the certification of the vote in Wayne County pending an audit of the count .  '

In [17]:
(annotation_df
 .loc[lambda df: df['quote_type'] == 'DIRECT OBSERVATION']
 ['sent']
 .loc[2].iloc[0]
)

'Mr. Bannon , the former chief strategist for President Trump , was warmly applauded when he addressed the party congress of the anti - immigrant National Front , led by Ms. Le Pen .'

In [21]:
annotation_df.shape

(25829, 6)

In [22]:
annotation_df.loc[lambda df: (df['quote_type'] != '')  & df['quote_type'].notnull()].shape

(14010, 6)

In [29]:
annotation_df['head'].loc[lambda df: df!=''].value_counts().shape 

(3601,)

In [40]:
annotation_df.apply(fix_quote_type, axis=1).value_counts().to_frame('count')

Unnamed: 0,count
No Quote,11807
Direct Quote,3964
Indirect Quote,3282
Background/Narrative,1909
Statement/Public Speech,1640
Published Work/Press Report,1365
Email/Social Media Post,676
Proposal/Order/Law,448
Other,305
Court Proceeding,270
