# search and download webpages

In [8]:
# ! pip3 install requests

In [1]:
import pandas as pd

In [3]:
import requests
import json

index_list = ["2018-43"]


### -----------------------
### Searches the Common Crawl Index for a domain.
### -----------------------
def search_domain(domain, limit=20):
    record_list = []
    print("[*] Trying target domain: %s" % domain)
    
    for index in index_list:
        print("[*] Trying index %s" % index)
        cc_url  = "http://index.commoncrawl.org/CC-MAIN-%s-index" % index
        
        response = requests.get(cc_url,params={'limit':limit,'url':domain,'output':'json'})
        
        if response.status_code == 200:
            records = response.content.splitlines()
            for record in records:
                record_list.append(json.loads(record))  
            print("[*] Added %d results." % len(records))
    print("[*] Found a total of %d hits." % len(record_list))
    return record_list

In [4]:
records=search_domain('https://economictimes.indiatimes.com/industry/auto/*')

[*] Trying target domain: https://economictimes.indiatimes.com/industry/auto/*
[*] Trying index 2018-43
[*] Added 20 results.
[*] Found a total of 20 hits.


In [5]:
[r['url'] for r in records]

['https://economictimes.indiatimes.com/industry/auto/100-ev-by-2030-in-india-is-only-an-aspiration/videoshow/61463373.cms',
 'https://economictimes.indiatimes.com/industry/auto/auto-components-makers-will-have-more-opportunity-than-disruption/videoshow/60710789.cms',
 'https://economictimes.indiatimes.com/industry/auto/auto-components/articlelist/64829316.cms?from=mdr',
 'https://economictimes.indiatimes.com/industry/auto/auto-news/abb-plans-to-make-ev-chargers-in-india-by-year-end/articleshow/66021903.cms',
 'https://economictimes.indiatimes.com/industry/auto/auto-news/articlelist/64829342.cms?from=mdr',
 'https://economictimes.indiatimes.com/industry/auto/auto-news/ashok-leyland-sales-up-26-pc-at-19373-units-in-september/articleshow/66025595.cms',
 'https://economictimes.indiatimes.com/industry/auto/auto-news/auto-companies-making-electric-vehicles-may-get-to-earn-tradable-credits/articleshow/66036489.cms',
 'https://economictimes.indiatimes.com/industry/auto/auto-news/auto-component

In [51]:
records[:1]

[{'urlkey': 'com,indiatimes,economictimes)/magazines/panache/-baahubali-2-to-be-third-indian-film-to-release-in-imax-print/articleshow/57673018.cms',
  'timestamp': '20181016210226',
  'filename': 'crawl-data/CC-MAIN-2018-43/segments/1539583510867.6/warc/CC-MAIN-20181016201314-20181016222814-00362.warc.gz',
  'url': 'https://economictimes.indiatimes.com/magazines/panache/-baahubali-2-to-be-third-indian-film-to-release-in-imax-print/articleshow/57673018.cms',
  'offset': '698387968',
  'status': '200',
  'length': '36320',
  'mime': 'text/html',
  'languages': 'eng,hin',
  'charset': 'UTF-8',
  'digest': 'ZSFMIBIMRXJPDI7RB45VSUTWIEZCNHYI',
  'mime-detected': 'text/html'}]

In [8]:
import re
re.sub(r'\W+',r'_', 'http://news18.com/news/movies/*')

'http_news18_com_news_movies_'

In [28]:
# for r in records:
#     if r['url'].
product_records=[r for r in records if '/tea/' in r['url']]
[r['url'] for r in product_records][:10]

['https://www.teabox.com/tea/adderley-twirl-winter-nilgiri-black-tea',
 'https://www.teabox.com/tea/antu-valley-classic-spring-nepal-black',
 'https://www.teabox.com/tea/apple-florentine-tea',
 'https://www.teabox.com/tea/arum-raisin-tea',
 'https://www.teabox.com/tea/arya-clonal-autumn-darjeeling-black-tea',
 'https://www.teabox.com/tea/assam-masala-chai-tea',
 'https://www.teabox.com/tea/assam-sunshine-blend-summer-black-tea',
 'https://www.teabox.com/tea/avongrove-clonal-organic-autumn-darjeeling-black-tea',
 'https://www.teabox.com/tea/balijan-thunder-summer-assam-black-tea',
 'https://www.teabox.com/tea/barnesbeg-summer-darjeeling-organic-green-tea']

In [53]:
import gzip
import io

# Version 1.2
## Author: David Cedar(2017)
#
# Downloads full page
#
def download_page(record):

    offset, length = int(record['offset']), int(record['length'])
    offset_end = offset + length - 1

    # We'll get the file via HTTPS so we don't need to worry about S3 credentials
    # Getting the file on S3 is equivalent however - you can request a Range
    prefix = 'https://commoncrawl.s3.amazonaws.com/'

    # We can then use the Range header to ask for just this set of bytes
    resp = requests.get(prefix + record['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})

    # The page is stored compressed (gzip) to save space
    # We can extract it using the GZIP library
    raw_data = io.BytesIO(resp.content)
    f = gzip.GzipFile(fileobj=raw_data)

    # What we have now is just the WARC response, formatted:
    data = f.read()

    response = ""

    if len(data):
        try:
            warc, header, response = data.strip().split(b'\r\n\r\n', 2)
        except:
            pass

    return response

In [None]:
import boto3
import botocore

# crawl-data/CC-MAIN-2018-43/segments/1539583513548.72/warc/CC-MAIN-20181021010654-20181021032154-00021.warc.gz

BUCKET_NAME = 'commoncrawl' # replace with your bucket name
KEY = 'crawl-data/CC-MAIN-2018-43/segments/1539583513548.72/warc/CC-MAIN-20181021010654-20181021032154-00021.warc.gz' # replace with your object key

s3 = boto3.resource('s3')

try:
    s3.Bucket(BUCKET_NAME).download_file(KEY, 'my_local_image.jpg')
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == "404":
        print("The object does not exist.")
    else:
        raise

In [146]:
! pip3 install boto3

Collecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/8e/92/58b5292e2278d4dfcbdc61009ada07cbace03355f256e83c67a95244f07b/boto3-1.9.46-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133kB 350kB/s ta 0:00:01
[?25hCollecting botocore<1.13.0,>=1.12.46 (from boto3)
[?25l  Downloading https://files.pythonhosted.org/packages/d3/56/5e640fa7cf01ba13c727699fb8cbbe1dd6786caf78b3ca0199e3b07f86c0/botocore-1.12.46-py2.py3-none-any.whl (4.8MB)
[K    100% |████████████████████████████████| 4.8MB 6.1MB/s eta 0:00:01
[?25hCollecting s3transfer<0.2.0,>=0.1.10 (from boto3)
[?25l  Downloading https://files.pythonhosted.org/packages/d7/14/2a0004d487464d120c9fb85313a75cd3d71a7506955be458eebfe19a6b1d/s3transfer-0.1.13-py2.py3-none-any.whl (59kB)
[K    100% |████████████████████████████████| 61kB 17.8MB/s ta 0:00:01
[?25hCollecting jmespath<1.0.0,>=0.7.1 (from boto3)
  Downloading https://files.pythonhosted.org/packages/b7/31/05c8d001f7f87f0f07289a5fc0

In [9]:
import boto3
from botocore import UNSIGNED
from botocore.client import Config

BUCKET='commoncrawl'
KEY='crawl-data/CC-MAIN-2018-43/segments/1539583510866.52/warc/CC-MAIN-20181016180631-20181016202131-00307.warc.gz'
PAGE_OFFSET=1010368906
PAGE_LEN=36721

# anonymous connection
s3 = boto3.resource('s3', config=Config(signature_version=UNSIGNED))

In [3]:
bkt=s3.Bucket(BUCKET)

In [55]:
obj=s3.Object(BUCKET, KEY)
fstream=obj.get(Range='bytes={}-{}'.format(PAGE_OFFSET, PAGE_OFFSET+PAGE_LEN-1))['Body']
file=fstream.read()

In [109]:
import io
import gzip
raw_data=io.BytesIO(file)
f=gzip.GzipFile(fileobj=raw_data)

In [None]:
with open(pickle)

In [39]:
! pip3 install warcio

Collecting warcio
[?25l  Downloading https://files.pythonhosted.org/packages/cf/99/2ee69694e3dea824a82d394a2ec1a8cc9130db641da84de1166d98b63e8b/warcio-1.6.3-py2.py3-none-any.whl (40kB)
[K    100% |████████████████████████████████| 40kB 288kB/s ta 0:00:011
Installing collected packages: warcio
Successfully installed warcio-1.6.3


In [110]:
from warcio.archiveiterator import ArchiveIterator
arc_iter=ArchiveIterator(f)
# for record in ArchiveIterator(f):
#     print(record.rec_type, record.content_stream().read())

In [112]:
arc_iter.__next__()

<warcio.recordloader.ArcWarcRecord at 0x118f5c320>

In [18]:
import pandas as pd
df=pd.read_json('/Users/akshit.jain/repos/commoncrawl/output/matching_urls.json', orient='records', lines=True)


In [98]:
df.head()

Unnamed: 0,charset,digest,filename,languages,length,mime,mime-detected,offset,status,timestamp,url,urlkey
0,UTF-8,ING23JTUS4DNB5Q4EBP4VFRIZOEMZTTE,crawl-data/CC-MAIN-2018-43/segments/1539583510...,eng,38643,text/html,text/html,1037204307,200,1970-08-22 13:50:17.013035,https://www.news18.com/news/movies/10-ka-dum-a...,"com,news18)/news/movies/10-ka-dum-after-winnin..."
1,UTF-8,GXDYMFM24ONBVVQR5KXPTCYLQ5WBP3XS,crawl-data/CC-MAIN-2018-43/segments/1539583515...,eng,31409,text/html,text/html,1012036861,200,1970-08-22 13:50:22.150356,https://www.news18.com/news/movies/10-things-a...,"com,news18)/news/movies/10-things-akshay-kumar..."
2,UTF-8,X7Q64J74QNJZ3RKNOUDFU3TVCBF77SVI,crawl-data/CC-MAIN-2018-43/segments/1539583512...,eng,34257,text/html,text/html,1037009554,200,1970-08-22 13:50:19.054131,https://www.news18.com/news/movies/10-things-a...,"com,news18)/news/movies/10-things-akshay-kumar..."
3,UTF-8,RWIED4P3UIOQJSROR6QV4JSR73GSKJHL,crawl-data/CC-MAIN-2018-43/segments/1539583513...,eng,31639,text/html,text/html,1006779471,200,1970-08-22 13:50:21.012315,https://www.news18.com/news/movies/10-things-a...,"com,news18)/news/movies/10-things-akshay-kumar..."
4,UTF-8,WYYRI67LYHDCM3R55NCUUWMUFYIAZWJ6,crawl-data/CC-MAIN-2018-43/segments/1539583515...,eng,31420,text/html,text/html,1006849651,200,1970-08-22 13:50:22.184831,https://www.news18.com/news/movies/10-things-y...,"com,news18)/news/movies/10-things-you-should-k..."


In [100]:
df.shape[0]

3051

In [87]:
record.content_stream().read()

b''

In [29]:
! mkdir data

In [56]:
html_content=download_page(records[0])

In [11]:
import pickle as pk

# with open('data/foo.html','wb') as f:
#     pk.dump(html_content,f)
# with open('data/foo.html','rb') as f:
#     html_content=pk.load(f)
with open('output/http_economictimes_indiatimes_com_industry_auto_/page_dumps/4bea23c674d9a116e1cff435c6cdd0d3.html') as f:
    html_content=f.read()

In [75]:
! pip3 install beautifulsoup4
! pip3 install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/da/9c/901d13b9d84262082e81d38879600dcec28beb994ae08d1a7cbab4dc3ece/lxml-4.2.5-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (8.6MB)
[K    100% |████████████████████████████████| 8.6MB 3.8MB/s ta 0:00:011
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.2.5


In [12]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_content, "lxml")

In [13]:
def clean_me(soup):
    for tag in ['script', 'style','meta','noscript']:
        [x.extract() for x in soup.find_all(tag)]
    return soup
    for s in soup(['script', 'style']):
        s.decompose()
    return '\n'.join(soup.stripped_strings)
# print(clean_me(soup))

# extract article text from html

In [24]:
doc=soup.find('div',{'class':'artText'}).get_text()
# soup=clean_me(soup)
print(soup.find('div', {'id':'article_body'}).text)

In [21]:
soup.find('div',{'class':'publish_on'})

<div class="publish_on flt">Sep 20, 2018, 06.41 AM IST</div>

In [64]:
doc

"MUMBAI: One of the most awaited sequel, ‘Baahubali 2: The Conclusion’, will release in IMAX format, the IMAX Corporation announced on Thursday.  Arka Mediaworks’ fantasy film will be digitally re-mastered in the immersive format and released in IMAX theatres in India on April 28 and across select international markets later.  ‘Baahubali 2’ marks the third Indian production to be released in IMAX. Prior to this, Aamir Khan-starrer ‘Dhoom:3’ and Hrithik Roshan-Katrina Kaif starrer ‘Bang Bang!’ were digitally re-mastered into IMAX format.  Directed by SS Rajamouli, ‘Baahubali 2: The Conclusion’ is the continuation of the 2015 box-office hit ‘Baahubali: The Beginning’. The sequel will see Prabhas along with Rana Daggubati, Anushka Shetty and Tamannaah Bhatia.  “A major factor for the wide appeal of the Baahubali films is the scale and immersive nature with which we designed and filmed them. So it is very exciting that 'Baahubali 2: The Conclusion' will be released in the IMAX format, whic

# test preprocessing with nltk

In [66]:
! pip3 install nltk

Collecting nltk
  Using cached https://files.pythonhosted.org/packages/50/09/3b1755d528ad9156ee7243d52aa5cd2b809ef053a0f31b53d92853dd653a/nltk-3.3.0.zip
Building wheels for collected packages: nltk
  Running setup.py bdist_wheel for nltk ... [?25ldone
[?25h  Stored in directory: /Users/akshit.jain/Library/Caches/pip/wheels/d1/ab/40/3bceea46922767e42986aef7606a600538ca80de6062dc266c
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.3


In [67]:
import nltk

In [69]:
sents

['MUMBAI: One of the most awaited sequel, ‘Baahubali 2: The Conclusion’, will release in IMAX format, the IMAX Corporation announced on Thursday.',
 'Arka Mediaworks’ fantasy film will be digitally re-mastered in the immersive format and released in IMAX theatres in India on April 28 and across select international markets later.',
 '‘Baahubali 2’ marks the third Indian production to be released in IMAX.',
 'Prior to this, Aamir Khan-starrer ‘Dhoom:3’ and Hrithik Roshan-Katrina Kaif starrer ‘Bang Bang!’ were digitally re-mastered into IMAX format.',
 'Directed by SS Rajamouli, ‘Baahubali 2: The Conclusion’ is the continuation of the 2015 box-office hit ‘Baahubali: The Beginning’.',
 'The sequel will see Prabhas along with Rana Daggubati, Anushka Shetty and Tamannaah Bhatia.',
 '“A major factor for the wide appeal of the Baahubali films is the scale and immersive nature with which we designed and filmed them.',
 "So it is very exciting that 'Baahubali 2: The Conclusion' will be released

In [68]:
from nltk.tokenize import sent_tokenize
sents = sent_tokenize(doc)

In [70]:
tokens = nltk.word_tokenize(sents[0])

In [71]:
tokens

['MUMBAI',
 ':',
 'One',
 'of',
 'the',
 'most',
 'awaited',
 'sequel',
 ',',
 '‘',
 'Baahubali',
 '2',
 ':',
 'The',
 'Conclusion',
 '’',
 ',',
 'will',
 'release',
 'in',
 'IMAX',
 'format',
 ',',
 'the',
 'IMAX',
 'Corporation',
 'announced',
 'on',
 'Thursday',
 '.']

In [74]:
nltk.download('averaged_perceptron_tagger')
tagged = nltk.pos_tag(tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/akshit.jain/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [75]:
tagged

[('MUMBAI', 'NN'),
 (':', ':'),
 ('One', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('most', 'RBS'),
 ('awaited', 'JJ'),
 ('sequel', 'NN'),
 (',', ','),
 ('‘', 'NNP'),
 ('Baahubali', 'NNP'),
 ('2', 'CD'),
 (':', ':'),
 ('The', 'DT'),
 ('Conclusion', 'NNP'),
 ('’', 'NNP'),
 (',', ','),
 ('will', 'MD'),
 ('release', 'VB'),
 ('in', 'IN'),
 ('IMAX', 'NNP'),
 ('format', 'NN'),
 (',', ','),
 ('the', 'DT'),
 ('IMAX', 'NNP'),
 ('Corporation', 'NNP'),
 ('announced', 'VBD'),
 ('on', 'IN'),
 ('Thursday', 'NNP'),
 ('.', '.')]

In [81]:
import nltk
# nltk.download('tagsets')
nltk.help.upenn_tagset('CD')

CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...


# NER with spacy

In [91]:
# ! pip3 install -U spacy
# ! python3 -m spacy download en
! python3 -m spacy download en_core_web_md

Collecting en_core_web_md==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz#egg=en_core_web_md==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz (120.8MB)
[K    100% |████████████████████████████████| 120.9MB 1.4MB/s ta 0:00:0111
[?25hInstalling collected packages: en-core-web-md
  Running setup.py install for en-core-web-md ... [?25ldone
[?25hSuccessfully installed en-core-web-md-2.0.0

[93m    Linking successful[0m
    /usr/local/lib/python3.7/site-packages/en_core_web_md -->
    /usr/local/lib/python3.7/site-packages/spacy/data/en_core_web_md

    You can now load the model via spacy.load('en_core_web_md')



In [175]:
# https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da
import spacy
from spacy import displacy
from collections import Counter

nlp = spacy.load('en_core_web_md')

In [176]:
from pprint import pprint
doc_parsed=nlp(doc)
pprint([(X.text, X.label_) for X in doc_parsed.ents])

[('Mumbai', 'GPE'),
 ('Megastar Amitabh Bachchan', 'PERSON'),
 ('Thugs of Hindostan', 'ORG'),
 (' \r', 'CARDINAL'),
 ('\n\r', 'ORG'),
 ('Russia', 'GPE'),
 ('Russian', 'LANGUAGE'),
 ('tomorrow', 'DATE'),
 (' \t \r\n\t', 'PERSON'),
 ('75', 'CARDINAL'),
 ('day', 'DATE'),
 ('Jodhpur', 'GPE'),
 ('Bachchan', 'PERSON'),
 ('Aamir Khan', 'PERSON'),
 ('Fatima Sana Shaikh', 'PERSON'),
 ('Katrina Kaif', 'PERSON'),
 ('Diwali 2018', 'DATE'),
 ('Yash Raj Films', 'ORG'),
 ('Vijay Krishna Acharya', 'PERSON'),
 ('Katrina', 'PERSON'),
 ('Aamir', 'PERSON'),
 ('Dhoom', 'PERSON'),
 ('3', 'CARDINAL'),
 ('1839', 'DATE'),
 ('Confessions of a Thug', 'WORK_OF_ART'),
 ('102', 'CARDINAL'),
 ('Rishi Kapoor', 'PERSON'),
 ('HindostanThugs', 'ORG'),
 ('Hindostan Shoot', 'PERSON')]


In [179]:
doc

'\nMumbai: Megastar Amitabh Bachchan is nearing the end of his shooting schedule for Thugs of Hindostan, which has been a strenuous project for him.  \rBig B wrote about it on his blog. \n\r"The work pressure increased. Some issues arose and that did not allow me the liberty of getting out of \'Russia\'. Something that can easily come off and on because of the intricate prosthetic make-up for \'Thugs of Hindostan\' and the time it takes. So, look a bit Russian too with that woolly cap.  \n\n \r\n "But work on the \'Thugs of Hindostan\' reaches an end soon like for me perhaps tomorrow then there are the promotions to do. Invent and write to execute them... it\'s work and more work... and it should never stop," he wrote.  \t \r\n\tThe cine icon, 75, was geared up for "another rough day", maybe the last for him on the film. \rIt was during the shoot of "Thugs of Hindostan" in Jodhpur when a team of doctors had to be flown in to check on Bachchan, who had been involved in some action scene

# IMDB Dataset for Distant Supervision
## https://www.imdb.com/interfaces/

In [None]:
! pushd data; wget https://datasets.imdbws.com/title.akas.tsv.gz .; gzip -d title.akas.tsv.gz; popd

In [97]:
! pip3 install pandas

Collecting pandas
[?25l  Downloading https://files.pythonhosted.org/packages/6b/dc/3a88b7bf8437f3f052fc90de72f28c06248142821a7f108e10ff3be5eb59/pandas-0.23.4-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (14.4MB)
[K    100% |████████████████████████████████| 14.4MB 2.7MB/s eta 0:00:01
Collecting pytz>=2011k (from pandas)
[?25l  Downloading https://files.pythonhosted.org/packages/f8/0e/2365ddc010afb3d79147f1dd544e5ee24bf4ece58ab99b16fbb465ce6dc0/pytz-2018.7-py2.py3-none-any.whl (506kB)
[K    100% |████████████████████████████████| 512kB 19.9MB/s ta 0:00:01
Installing collected packages: pytz, pandas
Successfully installed pandas-0.23.4 pytz-2018.7


In [98]:
import pandas as pd

## load movie titles

In [35]:
mov=pd.read_csv('data/title.akas.tsv', sep='\t')#, na_values=['\N']

  interactivity=interactivity, compiler=compiler, result=result)


In [36]:
mov.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
1,tt0000001,2,Карменсита,RU,\N,\N,\N,0
2,tt0000001,3,Carmencita,US,\N,\N,\N,0
3,tt0000001,4,Carmencita,\N,\N,original,\N,1
4,tt0000002,1,Le clown et ses chiens,\N,\N,original,\N,1


In [103]:
'IN' in mov.region.unique()

True

In [37]:
mov_in=mov[mov.region=='IN']
mov_in.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
6889,tt0003311,3,Raja Harishchandra,IN,\N,\N,\N,0
6891,tt0003311,5,King Harishchandra,IN,en,\N,literal title,0
28022,tt0011965,1,Bhakta Vidur,IN,\N,\N,\N,0
28044,tt0011971,2,Bilet Pherat,IN,\N,\N,\N,0
32673,tt0013568,2,Sadhu Aur Shaitaan,IN,hi,\N,alternative spelling,0


In [112]:
mov_in[mov_in.title.str.contains('Piku')]

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
2638153,tt3767372,3,Piku,IN,\N,\N,\N,0


## Load actors

In [38]:
act=pd.read_csv('data/title.principals.tsv.gz',sep='\t')
act.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Herself""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


In [39]:
act_in=act[act.tconst.isin(mov_in.titleId)]

In [120]:
act_in.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
17283,tt0003311,1,nm0196409,actor,\N,"[""Raja Harishchandra""]"
17284,tt0003311,2,nm0762507,actor,\N,"[""Taramati""]"
17285,tt0003311,3,nm0679609,actor,\N,"[""Rohitas""]"
17286,tt0003311,4,nm0762500,actor,\N,"[""Vishwamitra""]"
17287,tt0003311,5,nm0679610,director,\N,\N


In [40]:
names=pd.read_csv('data/name.basics.tsv.gz', sep='\t')
names.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0043044,tt0053137,tt0072308,tt0050419"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0038355,tt0071877,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,producer","tt0057345,tt0059956,tt0054452,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,writer,soundtrack","tt0077975,tt0078723,tt0080455,tt0072562"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0060827,tt0050986,tt0083922,tt0050976"


In [41]:
in_acts_list=act_in.nconst.unique()
names_in=names[names.nconst.isin(in_acts_list)]
names_in.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
50,nm0000051,James Mason,1909,1984,"actor,producer,writer","tt0046672,tt0084855,tt0056193,tt0047522"
58,nm0000059,Laurence Olivier,1907,1989,"actor,producer,director","tt0040416,tt0032976,tt0069281,tt0054331"
91,nm0000092,John Cleese,1939,\N,"actor,writer,producer","tt0063929,tt0071853,tt0085959,tt0095159"
95,nm0000096,Gillian Anderson,1968,\N,"actress,producer,soundtrack","tt0442632,tt0455590,tt2294189,tt0106179"
103,nm0000104,Antonio Banderas,1960,\N,"actor,soundtrack,producer","tt0112851,tt1189073,tt0120746,tt0120657"


In [130]:
names_in

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
50,nm0000051,James Mason,1909,1984,"actor,producer,writer","tt0046672,tt0084855,tt0056193,tt0047522"
58,nm0000059,Laurence Olivier,1907,1989,"actor,producer,director","tt0040416,tt0032976,tt0069281,tt0054331"
91,nm0000092,John Cleese,1939,\N,"actor,writer,producer","tt0063929,tt0071853,tt0085959,tt0095159"
95,nm0000096,Gillian Anderson,1968,\N,"actress,producer,soundtrack","tt0442632,tt0455590,tt2294189,tt0106179"
103,nm0000104,Antonio Banderas,1960,\N,"actor,soundtrack,producer","tt0112851,tt1189073,tt0120746,tt0120657"
106,nm0000107,Kim Basinger,1953,\N,"actress,soundtrack,producer","tt0096895,tt0109890,tt0119488,tt0337921"
111,nm0000112,Pierce Brosnan,1953,\N,"actor,producer,soundtrack","tt0795421,tt0143145,tt0120347,tt0113189"
114,nm0000115,Nicolas Cage,1964,\N,"actor,producer,soundtrack","tt0435705,tt0113627,tt0117500,tt0119094"
116,nm0000117,Neve Campbell,1973,\N,"actress,producer,soundtrack","tt0120082,tt0134084,tt1262416,tt0117571"
117,nm0000118,John Carpenter,1948,\N,"soundtrack,writer,music_department","tt0080749,tt0093777,tt0082340,tt0077651"


In [42]:
del act, names, mov
act=act_in
names=names_in
mov=mov_in

In [43]:
act.to_pickle('data/person_movie.pk')
names.to_pickle('data/person_names.pk')
mov.to_pickle('data/movie_titles.pk')

In [102]:
import hashlib
mystring = 'foo'
hash_object = hashlib.md5(mystring.encode()).hexdigest()
print(hash_object.hexdigest())

<class 'str'>


# check parsed pages

In [131]:
import os
os.chdir('/Users/akshit.jain/repos/commoncrawl')
articles=pd.read_pickle('output/http_www_news18_com_news_auto_/parsed_pages.pk')

In [132]:
# articles=articles[:500]
articles=articles[~articles.meta.apply(lambda x: x.story is None)].reset_index()


In [133]:
print(articles.iloc[5].meta.story)


	Auction house Artcurial Motorcars has revealed some of the collector cars set to go under the hammer at its upcoming auction, held alongside the Rétromobile classic and collector car show in Paris, Friday, February 9, 2018.   News18.com Presents Tech and Auto Awards 2017 | Vote For Your Favourite Smartphone, Car, Two-Wheeler And More! 
The star of next year's sale is likely to be a Ferrari 275P, chassis number #0816, owned by Pierre Bardinon, which won the Le Mans 24 hour race in 1964 with Jean Guichet and Nino Vaccarella at the wheel. This was the eighth and final win for the Ferrari team at Le Mans. Following the win, the car was sold and shipped to the USA where it continued to be driven in races for several years.  


  	 
	While the full auction catalog is expected to be finalized by mid-December, a selection of models have already been revealed, including a 1939 Bugatti 57C Atalante coupé; several Porsche cars, including a 904 GTS from the collection of Jean-Claude Miloé; ar

In [51]:
! pip3 install whoosh

Collecting whoosh
[?25l  Downloading https://files.pythonhosted.org/packages/ba/19/24d0f1f454a2c1eb689ca28d2f178db81e5024f42d82729a4ff6771155cf/Whoosh-2.7.4-py2.py3-none-any.whl (468kB)
[K    100% |████████████████████████████████| 471kB 1.6MB/s ta 0:00:01
[?25hInstalling collected packages: whoosh
Successfully installed whoosh-2.7.4


In [137]:
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer

schema = Schema(body=TEXT(analyzer=StemmingAnalyzer(), stored=True),
                url=ID(stored=True), id=ID(stored=True))

In [138]:
import os, os.path
from whoosh import index

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")

ix = index.create_in("indexdir", schema)

In [139]:
from whoosh import writing
from whoosh.writing import BufferedWriter, AsyncWriter

with AsyncWriter(ix) as writer:
    try:
        writer.mergetype = writing.CLEAR
        def add_doc(article, id):
            writer.add_document(body=article.story, url=article.url, id=str(id))
        for idx, row in articles.iterrows():
            if idx%100==0:
                print('#{}'.format(idx))
            add_doc(row.meta, idx)
        writer.commit()
    finally:
        pass
        #writer.close()
        

#0
#100
#200
#300
#400
#500
#600
#700
#800
#900
#1000
#1100
#1200
#1300


IndexingError: This writer is closed

In [90]:
from whoosh.qparser import QueryParser

qp = QueryParser("body", schema=ix.schema)
q = qp.parse(u"nano")

with ix.searcher() as s:
    results = s.search(q)
    print(len(results), results[0])

1 <Hit {'body': '\r\n\tAs the fuel prices have reached an all-time high in India, and with no sign of respite from the BJP-led central government, people are looking for alternative options of commuting. The petrol prices in Delhi as of 23rd May, 2018 stands at Rs 77.17 and diesel at Rs 68.34 (IOCL sourced prices). Public transportation or carpooling are a couple of options to offset the increasing fuel prices, but there are a lot of people in India who have to drive their own vehicle to office, because of lack of public transportation in their respective areas.   \rWhile electric cars are a credible option, the choices are very few. In comes the CNG (compressed natural gas) cars that run both on CNG and petrol. People living in metro cities have a big advantage of driving their cars on CNG fuel, as it is both cheap to refill, and better in mileage as compared to the diesel and petrol. However, people tend to go for an aftermarket fitment of CNG kits that might not be as safe as the co

# structured instance occ. in unstructured data

In [47]:
names[names.primaryName.str.contains('Amitabh Bachchan')]

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
820,nm0000821,Amitabh Bachchan,1942,\N,"actor,producer,writer","tt0337578,tt0375611,tt5571734,tt0248126"
5564843,nm5928994,Junior Amitabh Bachchan,\N,\N,actor,"tt4319640,tt0405069,tt3949164,tt6209476"


In [62]:
act_denorm=pd.merge(act[['tconst','nconst','category']], names[['nconst','primaryName']], on='nconst',how='inner')
mov_cast=pd.merge(act_denorm, mov[['titleId','title']], left_on='tconst', right_on='titleId',how='inner')[['title','primaryName','category']]
mov_cast.head()

Unnamed: 0,title,primaryName,category
0,Raja Harishchandra,D.D. Dabke,actor
1,King Harishchandra,D.D. Dabke,actor
2,Raja Harishchandra,P.G. Sane,actor
3,King Harishchandra,P.G. Sane,actor
4,Raja Harishchandra,Bhalachandra D. Phalke,actor


In [71]:
mov_cast.rename(columns={'title':'movie','primaryName':'actor','category':'role'}, inplace=True)
mov_cast.to_pickle('data/indian_movies_cast.pk')

In [72]:
mov_cast.head()

Unnamed: 0,movie,actor,role
0,Raja Harishchandra,D.D. Dabke,actor
1,King Harishchandra,D.D. Dabke,actor
2,Raja Harishchandra,P.G. Sane,actor
3,King Harishchandra,P.G. Sane,actor
4,Raja Harishchandra,Bhalachandra D. Phalke,actor


In [74]:
_df=mov_cast[mov_cast.actor=='Amitabh Bachchan']

In [95]:
_df

Unnamed: 0,movie,actor,role
10373,Mili,Amitabh Bachchan,actor
10401,Mahaan,Amitabh Bachchan,actor
11943,Kishore Kumar-Zindagi Ek Safar,Amitabh Bachchan,actor
11944,Zindagi Ek Safar,Amitabh Bachchan,actor
13163,Alaap,Amitabh Bachchan,actor
15812,Parwana,Amitabh Bachchan,actor
17025,Ganga Ki Saugand,Amitabh Bachchan,actor
19580,Paan khaye Saiyan Hamaar,Amitabh Bachchan,actor
22804,Saudagar,Amitabh Bachchan,actor
22961,Mysterious Manoeuvre,Amitabh Bachchan,actor


In [142]:
models="""KTM 125 Duke
Jawa
Yamaha YZF R15 V3
Honda Activa 5G
Royal Enfield Classic 350
KTM 200 Duke
Bajaj Pulsar 150
TVS Apache RTR 160
Royal Enfield Bullet 350
Hero Splendor Plus
Bajaj Pulsar 220 F
TVS Apache RTR 160 4V
Honda CB Shine
Bajaj Pulsar NS200
TVS Apache RTR 200 4V Race Edition 2.0
Yamaha FZ S FI (V 2.0)
Bajaj Pulsar RS200
Honda Dio
Honda CB Hornet 160R
KTM 390 Duke
Hero Destini 125
TVS Jupiter
Hero HF Deluxe
TVS Apache RR 310
Hero Super Splendor
KTM RC 200
Honda CB Unicorn 150
Royal Enfield Interceptor 650
Bajaj Dominar 400
Honda CBR250R
Honda Activa 125
Suzuki Intruder
KTM RC 390
TVS Apache RTR 180
Hero Passion Pro
Suzuki Access 125
Honda CB Shine SP
Honda Grazia
Suzuki Burgman Street
Bajaj Pulsar NS160
"""
models=models.split('\n')

In [173]:
from whoosh.qparser import QueryParser
from whoosh.query import Phrase, And,Term

qp = QueryParser("body", schema=ix.schema)
# q = qp.parse(u'"Amitabh Bachchan" AND "thugs of hindostan"')
# i=5
i+=1
model=models[i]
print(model)
q = qp.parse(u'"{}"'.format(model))
# q = And([Phrase('body', u"Amitabh Bachchan"), Phrase('body', u"Big B")])#Term('body',"Sholay")
# q = Phrase('body', '"dominar"')# u"amitabh bachchan")
# q = Term('body',"dominar")
s=ix.searcher()
hits = s.search(q)
print(len(hits))
from IPython.core.display import display, HTML
for hit in hits:
    display(HTML(hit.highlights('body',top=5,)))

Royal Enfield Interceptor 650
10


In [97]:
hits[0]

<Hit {'body': '\r\n\tBajaj Auto has silently hiked prices of its flagship motorcycle the 2018 Dominar 400 by Rs 2,000 for both the variants. Launched few months at a starting price of Rs 1.42 lakh (ex-showroom), the non-ABS version now costs Rs 1.44 lakh (ex-showroom) and the variant with dual-channel ABS is priced at Rs 1.56 lakh (ex-showroom).  \rThe 2018 Bajaj Dominar 400 gets new exterior colours including Rock Matte Black, Glacier Blue and Canyon Red shade along with already existing Midnight Blue, Twilight Plum, Moon White and Matt Black colors. Along with the new exterior colours the bike also gets new golden alloy wheels. \n\rThe Dominar 400 is powered by a 373cc triple spark four valves DTS-i engine with fuel injection and liquid cooling which produces 35PS of power and 35Nm of torque. The 373.2cc DTS-i engine delivers linear performance through its smooth six-speed transmission and slipper clutch.  \n\n \r\n The motorcycle has a large 43mm telescopic front fork with a dual sp

In [76]:
from urllib.parse import urlparse
urlparse('https://news18.com/news/auto/aston-martin-reveals-sports-car-for-the-skies-at-farnborough-airshow-1817055.html')

ParseResult(scheme='https', netloc='news18.com', path='/news/auto/aston-martin-reveals-sports-car-for-the-skies-at-farnborough-airshow-1817055.html', params='', query='', fragment='')

In [174]:
doc=articles.iloc[int(hits[0]['id'])].meta.story

In [144]:
q

And([Phrase('body', ['amitabh', 'bachchan'], slop=1, boost=1.000000), Term('body', 'piku')])

In [None]:
s.close()