<a href="https://colab.research.google.com/github/addicted-ai/fastai-tests/blob/main/trends_from_image.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!sudo apt install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 39 not upgraded.
Need to get 4,795 kB of archives.
After this operation, 15.8 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-eng all 4.00~git24-0e00fe6-1.2 [1,588 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-osd all 4.00~git24-0e00fe6-1.2 [2,989 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr amd64 4.00~git2288-10f4998a-2 [218 kB]
Fetched 4,795 kB in 1s (5,347 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl

In [2]:
import pandas as pd
import numpy as np

import datetime
from IPython.display import Image
from PIL import Image

import cv2
import os
from pathlib import Path
import glob

from hashlib import md5

import pytesseract

import matplotlib.pyplot as plt


In [4]:
!git clone https://github.com/fischerbach/fischerbach.github.io.git

Cloning into 'fischerbach.github.io'...
remote: Enumerating objects: 768, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 768 (delta 3), reused 11 (delta 2), pack-reused 753[K
Receiving objects: 100% (768/768), 125.16 MiB | 37.49 MiB/s, done.
Resolving deltas: 100% (64/64), done.
Checking out files: 100% (732/732), done.


In [5]:
# Folder inside the cloned repository that the dataset cells below read from.
WORK_PATH = "/content/fischerbach.github.io/trend_monitoring"

In [6]:
def extract_pictures_from_screenshot(filename, output_dir, minimum_width=100, minimum_height=100):
  """Crop every sufficiently large outer contour of a screenshot into its own PNG.

  The screenshot is converted to grayscale and binarised with an inverted Otsu
  threshold. The bounding box of each external contour that is at least
  ``minimum_width`` x ``minimum_height`` pixels is written to
  ``<output_dir>/<screenshot stem>_<n>.png``, with ``n`` counting from 1.

  Args:
    filename: Path of the screenshot to read.
    output_dir: Directory for the crops; created (with parents) if missing.
    minimum_width: Smallest bounding-box width, in pixels, that is kept.
    minimum_height: Smallest bounding-box height, in pixels, that is kept.

  Returns:
    List of the crop paths that were successfully written, in contour order.

  Raises:
    OSError: If ``filename`` cannot be read as an image.
  """
  image = cv2.imread(str(filename))
  if image is None:
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface later as an obscure error inside cvtColor.
    raise OSError(f'Could not read image: {filename}')

  Path(output_dir).mkdir(parents=True, exist_ok=True)

  gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

  # findContours returns a two- or three-element tuple depending on the OpenCV
  # version; the contours are the first element of the former and the second
  # element of the latter.
  found = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
  contours = found[0] if len(found) == 2 else found[1]

  stem = Path(filename).stem
  written = []
  roi_number = 1
  for contour in contours:
    x, y, w, h = cv2.boundingRect(contour)
    if w < minimum_width or h < minimum_height:
      continue
    out_image = os.path.join(output_dir, f'{stem}_{roi_number}.png')
    if cv2.imwrite(out_image, image[y:y+h, x:x+w]):
      written.append(out_image)
    roi_number += 1
  return written

In [7]:
def extract_words(filename, verbose=False, min_confidence=60, min_length=4):
  """OCR an image and weight each recognised word by the area it occupies.

  Every text box returned by ``pytesseract.image_to_data`` is lower-cased and
  stripped; boxes whose confidence is above ``min_confidence`` and whose word
  has at least ``min_length`` characters add ``width * height`` of the box to
  that word's running total.

  Args:
    filename: Path of the image to run OCR on.
    verbose: If true, display the running total each time a word is counted.
    min_confidence: Boxes must have a confidence strictly above this value.
    min_length: Minimum number of characters in the normalised word.

  Returns:
    Dict mapping each accepted word to the summed pixel area of its boxes.
  """
  # The context manager closes the underlying file once OCR is done.
  with Image.open(filename) as img:
    d = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

  result = {}
  for text, conf, width, height in zip(d['text'], d['conf'], d['width'], d['height']):
    word = text.strip().lower()
    # Confidence can arrive as an int, a float or a numeric string (e.g.
    # '96.5' or '-1') depending on the pytesseract/Tesseract versions in use;
    # float() accepts all of them where int() fails on '96.5'.
    if float(conf) <= min_confidence or len(word) < min_length:
      continue
    result[word] = result.get(word, 0) + width * height
    if verbose:
      display(f"{word} ==> {result[word]}")
  return result

In [8]:
from wordcloud import WordCloud

def generate_wordcloud(words_dict, output_filename):
  """Render a word cloud from a ``{word: weight}`` mapping and save it.

  Weights are normalised to fractions of their total and passed to
  ``WordCloud.generate_from_frequencies``; the result is drawn on a 40x30 inch
  figure without axes and saved to ``output_filename``.

  Args:
    words_dict: Mapping of word to numeric weight.
    output_filename: Path of the image file to write.

  Returns:
    True if an image was written, False if there was nothing to draw
    (no words at all, or the weights sum to zero).
  """
  text_weights_sum = sum(words_dict.values())
  if not words_dict or text_weights_sum == 0:
    # A zero total cannot be normalised (division by zero), and an empty
    # mapping is expected to be rejected by WordCloud, so skip the image.
    return False

  text_normalized = {word: weight / text_weights_sum for word, weight in words_dict.items()}
  wordcloud = WordCloud(width=4000, height=3000, background_color="white").generate_from_frequencies(text_normalized)

  fig, ax = plt.subplots(figsize=(40, 30))
  try:
    ax.imshow(wordcloud)
    ax.axis("off")
    fig.savefig(output_filename)
  finally:
    # Always release the large figure, even if saving fails.
    plt.close(fig)
  return True

In [9]:
from shutil import copyfile
import json

def generate_dataset(input_dir):
  """Build a per-date dataset folder for every PNG screenshot in ``input_dir``.

  For each ``*.png`` directly inside ``input_dir`` a sub-folder named after the
  first 10 characters of the file name is created (assumed to be a date prefix
  such as ``YYYY-MM-DD`` -- confirm against the screenshot naming scheme). The
  folder receives a copy of the screenshot, the crops produced by
  ``extract_pictures_from_screenshot`` (under ``extracted_pictures``), the OCR
  word weights as ``words.json`` and, when any words were found,
  ``wordcloud_<date>.png``.

  Screenshots sharing the same prefix share a folder, so ``words.json`` and
  the word cloud reflect the last such screenshot in sorted order.

  Args:
    input_dir: Directory containing the screenshots.
  """
  # Sorted so that repeated runs process (and overwrite) files in the same order.
  for image in sorted(glob.glob(f'{input_dir}/*.png')):
    image_name = os.path.basename(image)
    date = image_name[:10]
    out = os.path.join(os.path.dirname(image), date)
    Path(out).mkdir(parents=True, exist_ok=True)

    copyfile(image, os.path.join(out, image_name))

    extract_pictures_from_screenshot(image, os.path.join(out, 'extracted_pictures'))

    words = extract_words(image)
    with open(os.path.join(out, 'words.json'), "w") as words_file:
      json.dump(words, words_file)

    # A screenshot without any recognised words has nothing to plot; skip the
    # word cloud instead of letting one image abort the whole run.
    if words:
      generate_wordcloud(words, os.path.join(out, f"wordcloud_{date}.png"))

In [10]:
# Build the per-date folders for the Bloomberg screenshots.
generate_dataset(os.path.join(WORK_PATH, 'screenshots', 'www.bloomberg.com'))

In [None]:
# Build the per-date folders for the New York Post screenshots.
generate_dataset(os.path.join(WORK_PATH, 'screenshots', 'nypost.com'))