## TM names : 
- Ahmed   : 23PGAI0120
- Akash Deshwani: 23PGAI0035
- Harshada Suresh Jadhav: 23PGAI0101
- Rohan Mehta: 23PGAI0001

## Installing the required packages

In [20]:
!pip install bs4
!pip install numpy 
!pip install pandas
!pip install requests



## Importing the required packages

In [21]:
import json
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import csv

#### Importing the dataset from Json file

In [42]:
def read_file():
    a = "1_gg2015.json"
    df_original = pd.read_json(a)
    print(len(df_original))
    
    # Creating a copy of the dataframe for manipulating
    df = df_original.copy()
    
    # Creating a list of text for finding hosts
    text_host = df['text'].tolist()
    return(text_host)

In [46]:
df_test = read_file()
print(type(df_test))
print(df_test[:5])

1754153
<class 'list'>
['just had to scramble to find a golden globes stream for my brother. :D', "RT @ENews: Show us how you're watching the #GoldenGlobes -- tweet us a pic of your set up, we'll RT our faves! #ERedCarpet", '@danaKStew @50ShadesWorldcm @ScarletteDrake Also Red Carpet um 12 &amp; die Show vill. um 1?!', 'RT @lisarinna: When your husband tells you that you Are going to the #GoldenGlobes parties like 5 minutes before you go.......\nYou just gra…', '“@goldenglobes: Creating multiple mini Moët Moments on the @GoldenGlobes red carpet… http://t.co/vaLDYqbuD1\n#MoetMoment” May I have one plz?']


#### Preprocessing the data and converting it into a required list 

In [52]:
# Preprocessing data
def preprocessing(data):
    processed_data = []
    remove_list = ['think', 'thinking', 'should', 'would', 'maybe', 'could']
    for i in data:
        for j in remove_list:
            if j not in i:
                processed_data.append(i)
    remove_words_list = ['golden', 'globe', 'globes']
    for i in range(len(processed_data)):
        for j in remove_words_list:
            processed_data[i] = re.sub(j, '', processed_data[i], flags = re.IGNORECASE)
    return processed_data

In [53]:
df_1 = preprocessing(df_test)
print(df_1[:5])
print(len(df_1))

['just had to scramble to find a  s stream for my brother. :D', 'just had to scramble to find a  s stream for my brother. :D', 'just had to scramble to find a  s stream for my brother. :D', 'just had to scramble to find a  s stream for my brother. :D', 'just had to scramble to find a  s stream for my brother. :D']
10442632


#### Scraping the Top 1000 Actors and Actresses from the IMDB website

In [54]:
def scrape_actors():
    actorNames = []
    for single_page in range(1, 11):
        URL = f"https://www.imdb.com/list/ls058011111/?sort=list_order,asc&mode=detail&page={single_page}"
        r = requests.get(URL)
        soup = BeautifulSoup(r.text, 'html.parser')
        actorName = soup.find_all('h3',{'class':'lister-item-header'})
        actorName = [i.find('a').text.strip() for i in actorName]
        actorNames.extend(actorName)
    return actorNames

In [55]:
scrape_list = scrape_actors()
print(scrape_list[:5])
print(len(scrape_list))

['Robert De Niro', 'Jack Nicholson', 'Marlon Brando', 'Denzel Washington', 'Katharine Hepburn']
1000


### Extracting Hosts name from the Tweet text

In [56]:
# Finding Hosts
def get_hosts(data, actorNames):
    match = ''
    result={}
    for i in range(len(data)):
        found = re.search('(H|h)ost', data[i])
        if found:
            patt = re.compile(r'[A-Z][a-z]+ [A-Z][a-z]+')
            matches = patt.findall(data[i])
            for match in matches:
                if match in actorNames:
                    if match in result:
                        result[match] += 1
                    else:
                        result[match] = 1
    sorted_hosts = sorted(result.items(), key=lambda x:x[1], reverse=True)
    return(sorted_hosts[0:2])

In [57]:
host = get_hosts(df_1, scrape_list)
print(host)

[('Amy Poehler', 39778), ('Tina Fey', 34874)]


### Hurrah! We have successfully extracted the hosts name from the tweet text

##### Extracting the awards from the tweet text and storing it in a list

In [58]:
def get_awards(data):
    match = ''
    result = {}
    for i in range(len(data)):
        found = re.search('(B|b)est', data[i])
        if found:
            found1 = re.search(r'\bWin\b | \bWins\b | \bwin\b | \bwins\b | \bgoes to\b|', data[i])
            if found1:
                patt = re.compile(r'Best [A-Z][a-z]+ [A-Z*a-z ]+ - [A-Z*a-z ]+')
                matches = patt.findall(data[i])
                for match in matches:
                    if match.title() in result:
                        result[match.title()] += 1
                    else:
                        result[match.title()] = 1
    sorted_awards = sorted(result.items(), key=lambda x:x[1], reverse=True)
    return(sorted_awards[0:50])

In [60]:
awards = get_awards(df_1)
print(awards[:5])
print(len(awards))

[('Best Actor In A Motion Picture - Drama ', 13968), ('Best Motion Picture - Comedy Or Musical ', 11604), ('Best Supporting Actor In A Television Series - Matt Bomer ', 11238), ('Best Television Series Actor - Drama ', 10913), ('Best Actress In A Motion Picture - Comedy Or Musical ', 10415)]
50


##### we have found the awards but it is not in the required format and it has some extra characters (noise) in it

### Searching for the Presenters name in the tweet text

In [None]:
# Finding Presenters
def get_presenters(data, actorNames):
    match = ''
    result={}
    for i in range(len(data)):
        found = re.search('(P|p)resent(ing|ed|er|ers)', data[i])
        if found:
            patt = re.compile(r'[A-Z][a-z]+ [A-Z][a-z]+')
            matches = patt.findall(data[i])
            for match in matches:
                if match in actorNames:
                    if match in result:
                        result[match] += 1
                    else:
                        result[match] = 1
    sorted_presenters = sorted(result.items(), key=lambda x:x[1], reverse=True)
    return(sorted_presenters[0:15])

In [None]:
presenters = get_presenters(df_1, scrape_list)
print(presenters[:5])
print(len(presenters))

[('Jennifer Aniston', 1589), ('Salma Hayek', 1530), ('Dakota Johnson', 1376), ('Naomi Watts', 1278), ('Benedict Cumberbatch', 1177)]
50


#### Nominees for all the awards

In [63]:
# Finding Nominees
def get_nominees(data, actorNames):
    match = ''
    result={}
    for i in range(len(data)):
        found = re.search('(N|n)omin(ee|ees|ated)', data[i])
        if found:
            patt = re.compile(r'[A-Z][a-z]+ [A-Z][a-z]+')
            matches = patt.findall(data[i])
            for match in matches:
                if match in actorNames:
                    if match in result:
                        result[match] += 1
                    else:
                        result[match] = 1
    sorted_presenters = sorted(result.items(), key=lambda x:x[1], reverse=True)
    return(sorted_presenters[0:100])

In [64]:
nom = get_nominees(df_1, scrape_list)
print(nom[:5])
print(len(nom))

[('George Clooney', 2351), ('Kevin Spacey', 2201), ('Amy Adams', 1295), ('Bill Murray', 769), ('Rosamund Pike', 701)]
100


#### Finally Winners of all the awards

In [65]:
def get_winners(data, awards, actorNames):
    match = ''
    result = {}
    for i in range(len(data)):
        for j in range(len(awards)):
            found = re.search(awards[j][0].lower(), data[i].lower())
            if found:
                found1 = re.search(r'\bWin\b | \bWins\b | \bwin\b | \bwins\b | \bgoes to\b|', data[i])
                if found1:
                    patt = re.compile(r'[A-Z][a-z]+ [A-Z][a-z]+')
                    matches = patt.findall(data[i])
                    for match in matches:
                        if match in actorNames:
                            if match.title() in result:
                                result[match.title()] += 1
                            else:
                                result[match.title()] = 1
    sorted_awards = sorted(result.items(), key=lambda x:x[1], reverse=True)
    return(sorted_awards[0:50])

In [66]:
winnerslist = get_winners(df_1, awards, scrape_list)
print(winnerslist[:5])
print(len(winnerslist))

[('Julianne Moore', 20124), ('Patricia Arquette', 14622), ('Kevin Spacey', 12515), ('Amy Adams', 10589), ('Michael Keaton', 9762)]
14


In [69]:
def report(hosts, awards, presenters, nominees, winners):
    
    print('Hosts: ')
    for i in hosts:
        print(i[0])
    print()
    print('Presenters: ')
    for i in presenters:
        print(i[0])
    print()
    print('Award Names: ')
    for i in awards:
        print(i[0])
    print()
    print('Nominees: ')
    for i in nominees:
        print(i[0])
    print()
    print('Winners: ')
    for i in winners:
        print(i[0])
    print()

In [70]:
text_host = read_file()
text_host1 = preprocessing(text_host)
actorNames = scrape_actors()
hosts = get_hosts(text_host1, actorNames)
awards = get_awards(text_host1)
presenters = get_presenters(text_host1,actorNames)
nominees = get_nominees(text_host1, actorNames)
winners = get_winners(text_host1, awards, actorNames)
report(hosts, awards, presenters, nominees, winners)

1754153
