In this notebook, we map each series in our database to its identifier (`tconst`) in the IMDb database.

In [198]:
import modules.import_data as import_data
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import re
import requests
import urllib.parse
from bs4 import BeautifulSoup
from queue import Queue
import random
import time
import threading
# Register tqdm's pandas integration so that `progress_apply` is available below.
tqdm.pandas()

class Colors:
    """ANSI escape sequences used as prefixes to colorize the scraper's log lines.

    Sequences starting with ``\\033[0m`` first reset any previous styling.
    None of the constants appends a reset, so the style stays active until
    another sequence is printed.
    """
    # Reset, then bright-blue foreground.
    INFO = '\033[0m\033[94m'
    # Reset, then bright-cyan foreground.
    INFO2 = '\033[0m\033[96m'
    # Green background with white foreground.
    SUCCESS = '\033[42m\033[37m'
    # Bright-yellow foreground (no leading reset).
    WARNING = '\033[93m'
    # Red background with white foreground.
    ERROR = '\033[41m\033[37m'
    # Reset, then bright-red foreground.
    FAIL = '\033[0m\033[91m'
    # Reset, then bold.
    BOLD = '\033[0m\033[1m'

## Importing data

Import des données de sous-titres

In [2]:
# Load the DataFrame that lists each series and the path to its lemmatized subtitles.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
with open("data/pickle/df_paths_series.pickle", 'rb') as file:
      df_paths_series = pickle.load(file)
df_paths_series

Unnamed: 0,serie,path_lemmatized
0,Lost,data/lemmatized_series/1___Lost.pickle
1,Heroes,data/lemmatized_series/2___Heroes.pickle
2,Jericho_(2006),data/lemmatized_series/3___Jericho_(2006).pickle
3,Prison_Break,data/lemmatized_series/4___Prison_Break.pickle
4,Supernatural,data/lemmatized_series/5___Supernatural.pickle
...,...,...
3522,JoJo_s_Bizarre_Adventure_(2012),data/lemmatized_series/5472___JoJo_s_Bizarre_A...
3523,Auschwitz__The_Nazis_and_the_Final_Solution,data/lemmatized_series/5477___Auschwitz__The_N...
3524,Modus,data/lemmatized_series/5478___Modus.pickle
3525,,data/lemmatized_series/5479________.pickle


Import des données d'IMDb

In [3]:
# Read the IMDb title.basics dump: tab-separated, with the literal "\N" marking
# missing values. low_memory=False makes pandas parse the file in one pass.
df_title_basics_imdb = pd.read_csv(
    "data/imdb/raw/title.basics.tsv", 
    sep="\t", 
    na_values="\\N", 
    low_memory=False
)

In [4]:
# Preview the first rows of the IMDb table.
df_title_basics_imdb.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1,"Comedy,Short"


## Preparing the data

On crée une colonne `serie_clean_name` à partir de laquelle on va essayer de faire correspondre les données d'IMDb

In [5]:
def preprocess_serie_name(text):
    """Turn a raw series folder name into a title suitable for matching against IMDb.

    Steps, in order:
      1. underscores become spaces;
      2. a parenthesised run of digits such as ``(2006)`` is removed;
      3. runs of whitespace are collapsed to a single space, so that double
         underscores or a removed year in the middle of a name do not leave
         double spaces that would defeat an exact title comparison;
      4. an isolated `` s `` is rewritten as ``'s `` (the apostrophe was lost
         in the raw names, e.g. ``JoJo_s`` -> ``JoJo's``);
      5. leading and trailing whitespace is stripped.

    Args:
        text: raw series name, e.g. ``"Jericho_(2006)"``.

    Returns:
        The cleaned title, e.g. ``"Jericho"``.
    """
    res = text.replace("_", " ")
    res = re.sub(r"\([0-9]*\)", "", res)
    # Collapse before restoring the apostrophe so "Foo__s_Bar" is handled too.
    res = re.sub(r"\s+", " ", res)
    res = re.sub(r" s ", "'s ", res)
    return res.strip()

In [6]:
# Derive the cleaned title of every series (with a progress bar), then display the result.
df_paths_series["serie_clean_name"] = df_paths_series["serie"].progress_apply(preprocess_serie_name)
df_paths_series

100%|██████████| 3527/3527 [00:00<00:00, 268515.24it/s]


Unnamed: 0,serie,path_lemmatized,serie_clean_name
0,Lost,data/lemmatized_series/1___Lost.pickle,Lost
1,Heroes,data/lemmatized_series/2___Heroes.pickle,Heroes
2,Jericho_(2006),data/lemmatized_series/3___Jericho_(2006).pickle,Jericho
3,Prison_Break,data/lemmatized_series/4___Prison_Break.pickle,Prison Break
4,Supernatural,data/lemmatized_series/5___Supernatural.pickle,Supernatural
...,...,...,...
3522,JoJo_s_Bizarre_Adventure_(2012),data/lemmatized_series/5472___JoJo_s_Bizarre_A...,JoJo's Bizarre Adventure
3523,Auschwitz__The_Nazis_and_the_Final_Solution,data/lemmatized_series/5477___Auschwitz__The_N...,Auschwitz The Nazis and the Final Solution
3524,Modus,data/lemmatized_series/5478___Modus.pickle,Modus
3525,,data/lemmatized_series/5479________.pickle,


In [7]:
# Restrict the IMDb table to TV series and mini-series, the only title types
# our subtitle corpus can correspond to.
df_title_basics_imdb_tv_series = df_title_basics_imdb[
    df_title_basics_imdb["titleType"].isin(["tvSeries", "tvMiniSeries"])
]

## Searching tconst of our series

In [8]:
def set_possible_tconst(i):
    """Record, for row ``i`` of ``df_paths_series``, the IMDb candidates with the same title.

    Looks up every TV (mini-)series in ``df_title_basics_imdb_tv_series`` whose
    ``primaryTitle`` equals the row's ``serie_clean_name`` exactly, then writes
    back into ``df_paths_series``:
      * ``possible_tconst``: the matching ``tconst`` values joined with a backslash;
      * ``nb_possible_tconst``: how many candidates matched.

    Args:
        i: row label in ``df_paths_series``.
    """
    wanted_title = df_paths_series.loc[i, "serie_clean_name"]
    # Comparing on the raw numpy array gives a plain boolean mask.
    same_title = df_title_basics_imdb_tv_series["primaryTitle"].to_numpy() == wanted_title
    candidates = df_title_basics_imdb_tv_series.loc[same_title, "tconst"]
    df_paths_series.loc[i, "possible_tconst"] = candidates.str.cat(sep="\\")
    df_paths_series.loc[i, "nb_possible_tconst"] = len(candidates)

In [9]:
# Temporary positional column: progress_apply only passes values, so the row
# number is fed to set_possible_tconst through a column, then removed.
# NOTE(review): this assumes the row labels of df_paths_series are 0..n-1 — confirm.
df_paths_series["index"] = np.arange(len(df_paths_series))
df_paths_series["index"].progress_apply(set_possible_tconst)
del df_paths_series["index"]

100%|██████████| 3527/3527 [00:32<00:00, 108.33it/s]


In [10]:
# Display the series table with the candidate columns filled in.
df_paths_series

Unnamed: 0,serie,path_lemmatized,serie_clean_name,possible_tconst,nb_possible_tconst
0,Lost,data/lemmatized_series/1___Lost.pickle,Lost,tt0292816\tt0411008\tt0485299\tt14609588\tt157...,5.0
1,Heroes,data/lemmatized_series/2___Heroes.pickle,Heroes,tt0291617\tt0452723\tt0802147\tt0813715\tt1279...,12.0
2,Jericho_(2006),data/lemmatized_series/3___Jericho_(2006).pickle,Jericho,tt0059999\tt0437013\tt0805663\tt5178604,4.0
3,Prison_Break,data/lemmatized_series/4___Prison_Break.pickle,Prison Break,tt0455275,1.0
4,Supernatural,data/lemmatized_series/5___Supernatural.pickle,Supernatural,tt0396375\tt0460681,2.0
...,...,...,...,...,...
3522,JoJo_s_Bizarre_Adventure_(2012),data/lemmatized_series/5472___JoJo_s_Bizarre_A...,JoJo's Bizarre Adventure,tt2359704,1.0
3523,Auschwitz__The_Nazis_and_the_Final_Solution,data/lemmatized_series/5477___Auschwitz__The_N...,Auschwitz The Nazis and the Final Solution,,0.0
3524,Modus,data/lemmatized_series/5478___Modus.pickle,Modus,tt21237458\tt4600404,2.0
3525,,data/lemmatized_series/5479________.pickle,,,0.0


In [11]:
# Series without exactly one IMDb candidate (zero matches, or several homonyms).
df_paths_series[df_paths_series["nb_possible_tconst"] != 1]

Unnamed: 0,serie,path_lemmatized,serie_clean_name,possible_tconst,nb_possible_tconst
0,Lost,data/lemmatized_series/1___Lost.pickle,Lost,tt0292816\tt0411008\tt0485299\tt14609588\tt157...,5.0
1,Heroes,data/lemmatized_series/2___Heroes.pickle,Heroes,tt0291617\tt0452723\tt0802147\tt0813715\tt1279...,12.0
2,Jericho_(2006),data/lemmatized_series/3___Jericho_(2006).pickle,Jericho,tt0059999\tt0437013\tt0805663\tt5178604,4.0
4,Supernatural,data/lemmatized_series/5___Supernatural.pickle,Supernatural,tt0396375\tt0460681,2.0
7,Smallville,data/lemmatized_series/10___Smallville.pickle,Smallville,tt0279600\tt14921578,2.0
...,...,...,...,...,...
3520,Jekyll_and_Hyde,data/lemmatized_series/5469___Jekyll_and_Hyde....,Jekyll and Hyde,tt22080052\tt4224588,2.0
3523,Auschwitz__The_Nazis_and_the_Final_Solution,data/lemmatized_series/5477___Auschwitz__The_N...,Auschwitz The Nazis and the Final Solution,,0.0
3524,Modus,data/lemmatized_series/5478___Modus.pickle,Modus,tt21237458\tt4600404,2.0
3525,,data/lemmatized_series/5479________.pickle,,,0.0


### Using the IMDB Searchbar

In [211]:
# Rows still lacking a unique candidate. reset_index() keeps each row's original
# label in an "index" column, which is used later to write results back into
# df_paths_series.
df_paths_series_to_scrap = df_paths_series[df_paths_series["nb_possible_tconst"] != 1].reset_index()

In [212]:
# Preview the rows queued for scraping.
df_paths_series_to_scrap

Unnamed: 0,index,serie,path_lemmatized,serie_clean_name,possible_tconst,nb_possible_tconst
0,362,Sabans_Diabolik,data/lemmatized_series/438___Sabans_Diabolik.p...,Sabans Diabolik,,0.0
1,489,Choch_Berry_Pie,data/lemmatized_series/569___Choch_Berry_Pie.p...,Choch Berry Pie,,0.0
2,1002,Big_Love_Webisodes,data/lemmatized_series/1146___Big_Love_Webisod...,Big Love Webisodes,,0.0
3,1284,Hamarinn_(The_Cliff),data/lemmatized_series/1466___Hamarinn_(The_Cl...,Hamarinn (The Cliff),,0.0
4,1292,Rejseholdet_(Unit_One),data/lemmatized_series/1478___Rejseholdet_(Uni...,Rejseholdet (Unit One),,0.0
5,1741,Real_Humans_(_kta_M_nniskor),data/lemmatized_series/2219___Real_Humans_(_kt...,Real Humans ( kta M nniskor),,0.0
6,1797,Riots_And_Revolutions__Nel_s_Arab_Journey,data/lemmatized_series/2368___Riots_And_Revolu...,Riots And Revolutions Nel's Arab Journey,,0.0
7,1977,C4_Comedy_Presents__Them_from_That_Thing,data/lemmatized_series/2821___C4_Comedy_Presen...,C4 Comedy Presents Them from That Thing,,0.0
8,2189,Grimm_Webisodes,data/lemmatized_series/3293___Grimm_Webisodes....,Grimm Webisodes,,0.0
9,2583,Reptilian_Battleground,data/lemmatized_series/4093___Reptilian_Battle...,Reptilian Battleground,,0.0


In [174]:
def _tconst_from_href(href):
    """Return the ``tt…`` identifier embedded in an IMDb title link, or None."""
    match = re.match(r"/title/(tt\d+)", href or "")
    return match.group(1) if match else None


def scrap_most_relevent_tconst(serie_clean_name, socks5, timeout=15):
    """Query the IMDb search page and return the tconst of the first title result.

    Args:
        serie_clean_name: title to search for; it is URL-quoted into the query.
        socks5: proxy URL (e.g. ``socks4://host:port`` or ``socks5://user:pwd@host:port``)
            through which the request is routed. A falsy value means a direct
            connection. Previously this argument was accepted but never used,
            so every request left from the local machine.
            NOTE(review): SOCKS proxies need the optional ``requests[socks]``
            extra (PySocks); without it every call fails and returns None —
            confirm it is installed.
        timeout: seconds to wait for the proxy or IMDb before giving up, so a
            dead proxy cannot block the calling thread forever.

    Returns:
        The first ``tt…`` identifier found among the links of the result lists,
        or None when the request fails or no title link is present. Callers
        cannot tell these two cases apart.
    """
    url = "https://www.imdb.com/find/?q=" + urllib.parse.quote(serie_clean_name)
    # A browser-like user agent is sent to get past IMDb's bot protection.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.35',
    }
    proxies = {"http": socks5, "https": socks5} if socks5 else None
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Scan the result lists in document order and keep the first real title
        # link; non-title links (people, companies, ...) are skipped instead of
        # yielding an empty or garbage identifier.
        for anchor in soup.select("ul.ipc-metadata-list a[href]"):
            tconst = _tconst_from_href(anchor.get("href"))
            if tconst is not None:
                return tconst
        return None
    except Exception:
        # Best effort: any network, proxy or parsing failure is reported as None.
        return None

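#### Proxies

> **Attention** : la liste ci-dessous contient des identifiants SOCKS5 (utilisateur / mot de passe) en clair. Ils devraient être révoqués, puis chargés depuis une variable d'environnement ou un fichier non versionné plutôt que d'être stockés dans le notebook.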

In [203]:
proxies_str = """
socks4://:@51.83.116.10:31625
socks4://:@181.204.214.178:5968
socks4://:@190.138.250.48:3629
socks4://:@94.198.211.217:5678
socks4://:@216.65.153.164:57369
socks4://:@72.249.209.140:5678
socks4://:@181.113.135.254:50083
socks4://:@152.231.91.206:35010
socks4://:@45.128.133.209:1080
socks4://:@49.156.38.126:5678
socks4://:@190.182.88.214:30956
socks4://:@213.226.11.149:59086
socks4://:@203.153.125.13:65424
socks4://:@213.91.128.99:10801
socks4://:@45.228.147.204:4153
socks4://:@103.114.96.93:1080
socks4://:@142.166.131.50:5678
socks4://:@196.25.170.130:4145
socks5://sgvxptls:02a5uhzjxpxo@81.82.240.120:20947
socks4://:@180.210.222.233:1080
socks4://:@86.100.63.127:4145
socks4://:@190.249.169.153:3629
socks4://:@88.84.62.5:4153
socks4://:@51.222.108.216:25018
socks4://:@45.172.177.1:59341
socks4://:@185.93.240.133:10801
socks4://:@196.3.97.71:5678
socks4://:@193.106.57.96:5678
socks4://:@202.166.206.59:5678
socks4://:@187.19.127.246:8011
socks4://:@109.69.0.179:5678
socks4://:@89.133.13.97:4145
socks4://:@103.130.114.207:39163
socks4://:@94.72.158.129:4153
socks4://:@103.163.171.240:1088
socks4://:@146.196.63.161:4145
socks4://:@47.252.14.154:1080
socks4://:@92.245.102.242:1080
socks4://:@171.239.161.130:11218
socks4://:@200.111.158.234:5678
socks4://:@92.204.129.162:39875
socks4://:@197.211.24.206:5678
socks4://:@94.240.198.202:5678
socks4://:@174.138.176.78:54117
socks4://:@202.51.103.154:5678
socks4://:@184.168.121.153:47114
socks4://:@193.105.62.11:58973
socks4://:@41.79.10.218:4673
socks4://:@213.145.139.202:5678
socks4://:@110.78.149.200:4145
socks4://:@46.98.191.242:5678
socks4://:@91.243.192.17:3629
socks4://:@213.14.19.252:1080
socks4://:@66.23.233.210:32746
socks4://:@188.136.162.30:4153
socks4://:@129.205.198.134:5678
socks4://:@36.66.220.171:5678
socks4://:@103.119.67.243:1080
socks4://:@143.202.136.49:5678
socks4://:@103.110.89.78:5678
socks4://:@186.86.51.99:4153
socks4://:@79.137.203.245:1080
socks4://:@103.26.209.206:11080
socks4://:@198.211.110.125:7399
socks4://:@176.57.75.111:5678
socks4://:@142.93.6.130:9063
socks4://:@185.200.152.133:1080
socks4://:@109.167.134.253:44788
socks4://:@181.233.94.19:5678
socks4://:@58.147.171.106:10801
socks4://:@173.236.168.8:1048
socks5://sgvxptls:02a5uhzjxpxo@185.199.229.156:7492
socks4://:@185.209.220.255:1080
socks4://:@110.77.135.70:4145
socks4://:@80.241.44.34:5678
socks4://:@205.177.85.130:39593
socks4://:@81.199.14.49:1088
socks4://:@46.107.230.122:1080
socks4://:@103.53.110.45:10801
socks4://:@193.59.26.208:4153
socks4://:@202.131.235.138:4153
socks4://:@216.155.93.238:5678
socks4://:@89.218.5.106:50733
socks4://:@196.15.155.209:4145
socks4://:@95.158.174.111:1080
socks4://:@124.109.44.126:4145
socks4://:@41.223.65.158:4153
socks4://:@190.104.213.175:1080
socks4://:@68.183.16.50:47539
socks4://:@46.98.184.203:5678
socks4://:@143.255.141.113:5678
socks4://:@103.10.228.29:4145
socks4://:@104.236.114.255:27829
socks4://:@154.66.120.80:57775
socks4://:@190.129.48.62:1080
socks4://:@101.51.121.29:4153
socks4://:@89.161.88.30:5678
socks4://:@217.117.142.18:3629
socks4://:@38.91.106.214:48778
socks4://:@202.131.246.250:5678
socks4://:@103.167.134.71:7890
socks4://:@189.2.164.165:4153
socks4://:@185.216.18.138:44550
socks4://:@186.103.143.213:4153
socks4://:@96.9.80.75:1080
socks4://:@146.196.121.29:35010
socks4://:@185.32.4.65:4153
socks4://:@209.13.96.165:39921
socks4://:@159.224.243.185:61303
socks4://:@168.121.236.212:5678
socks4://:@91.221.240.253:1080
socks4://:@197.210.96.58:5678
socks4://:@180.92.212.178:5678
socks4://:@51.68.94.167:29707
socks4://:@89.38.11.16:60355
socks4://:@41.184.212.3:4153
socks4://:@103.40.122.194:1080
socks4://:@98.178.72.21:10919
socks4://:@177.36.185.180:5678
socks4://:@36.66.126.219:32000
socks4://:@147.135.5.177:59017
socks4://:@188.95.20.138:5678
socks4://:@77.77.26.152:4153
socks4://:@103.95.97.50:4153
socks4://:@45.73.0.118:5678
socks4://:@104.248.250.110:7510
socks4://:@141.105.107.34:5678
socks4://:@85.174.84.22:1080
socks4://:@70.80.75.236:5678
socks4://:@183.88.240.53:4145
socks4://:@103.168.207.9:3629
socks4://:@190.11.116.49:4153
socks4://:@138.255.240.66:40736
socks4://:@51.83.184.241:9191
socks4://:@82.209.165.206:4153
socks4://:@173.236.180.0:45508
socks4://:@167.99.182.125:7101
socks4://:@223.165.243.209:47205
socks4://:@45.166.26.129:53695
socks4://:@116.206.233.78:4153
socks4://:@200.0.247.82:4153
socks4://:@103.111.160.41:5678
socks4://:@103.60.214.18:51754
socks4://:@185.87.121.5:8975
socks4://:@200.105.192.6:5678
socks4://:@91.199.93.32:4153
socks4://:@115.127.79.234:1080
socks4://:@195.138.65.34:5678
socks4://:@185.220.86.47:5678
socks4://:@89.41.106.8:4145
socks4://:@45.128.133.141:1080
socks4://:@106.240.89.60:4145
socks4://:@202.77.121.178:4153
socks4://:@202.58.199.229:5678
socks4://:@65.20.235.3:55555
socks4://:@51.158.103.11:16379
socks4://:@181.205.46.178:4666
socks4://:@161.0.39.78:1080
socks4://:@103.229.83.106:6789
socks4://:@8.39.228.33:39593
socks4://:@117.20.58.242:5678
socks4://:@212.5.132.74:5678
socks4://:@5.160.61.122:4145
socks4://:@45.122.44.2:5678
socks4://:@110.238.109.146:8001
socks4://:@202.46.91.218:12391
socks4://:@168.232.60.89:5678
socks4://:@203.194.21.241:4153
socks4://:@45.238.57.1:3629
socks4://:@49.0.246.130:1000
socks4://:@208.97.186.78:15248
socks4://:@51.83.116.4:53425
socks4://:@216.65.148.83:64483
socks4://:@95.140.117.10:1080
socks4://:@46.172.75.51:5678
socks4://:@197.250.15.87:5678
socks4://:@103.52.252.18:5678
socks4://:@103.171.150.158:1081
socks4://:@91.121.48.221:38711
socks4://:@45.128.133.225:1080
socks4://:@168.121.253.34:1080
socks4://:@154.66.108.34:10081
socks4://:@1.32.59.217:31981
socks4://:@203.205.34.58:5678
socks4://:@181.174.85.78:5678
socks4://:@197.231.205.96:5678
socks4://:@190.2.136.35:31819
socks4://:@46.171.28.162:59311
socks4://:@51.77.141.29:50450
socks5://sgvxptls:02a5uhzjxpxo@2.56.119.93:5074
socks4://:@81.7.86.154:4145
socks4://:@177.93.78.25:4153
socks4://:@154.94.0.133:5678
socks4://:@51.158.119.71:16379
socks4://:@177.234.245.243:32213
socks4://:@103.127.204.108:24963
socks4://:@160.226.139.135:1080
socks4://:@43.230.196.98:48200
socks4://:@91.92.78.207:4145
socks4://:@37.57.15.43:47233
socks4://:@103.231.33.64:65416
socks4://:@202.148.22.106:5678
socks4://:@5.133.24.210:1080
socks4://:@103.10.208.220:4153
socks4://:@95.165.163.188:36496
socks4://:@146.59.178.222:35222
socks4://:@182.16.171.65:51459
socks4://:@200.0.247.84:4153
socks4://:@216.65.148.101:64483
socks4://:@182.253.40.55:4153
socks4://:@193.59.26.230:4145
socks4://:@178.72.90.70:5678
socks4://:@176.119.227.65:5678
socks4://:@119.15.89.87:5678
socks4://:@117.102.101.52:5678
socks4://:@51.83.116.9:46127
socks4://:@213.6.77.198:5678
socks4://:@85.196.151.2:4153
socks4://:@217.73.171.50:5678
socks4://:@103.161.68.12:1080
socks4://:@202.164.209.55:56789
socks4://:@70.32.25.164:44441
socks4://:@91.213.119.246:46024
socks4://:@185.186.17.57:5678
socks4://:@213.171.44.82:3629
socks4://:@203.174.26.129:4153
socks4://:@198.20.190.61:31444
socks4://:@212.126.5.242:42344
socks4://:@209.198.43.6:5678
socks4://:@82.147.123.186:10808
socks4://:@182.52.58.44:4153
socks4://:@185.154.204.91:4153
socks4://:@138.97.221.0:35010
socks4://:@181.143.59.140:4153
socks4://:@114.134.90.43:5678
socks4://:@51.158.98.197:16379
socks4://:@136.228.160.250:5678
socks4://:@188.75.186.152:4145
socks4://:@199.127.176.139:64312
socks4://:@182.23.36.82:4153
socks4://:@174.138.62.182:27711
socks4://:@103.81.196.125:5678
socks4://:@200.174.228.183:4153
socks4://:@115.242.204.122:5678
socks4://:@125.228.94.199:4145
socks4://:@77.242.133.172:5678
socks4://:@82.114.92.222:4145
socks4://:@5.58.47.25:3629
socks4://:@103.163.50.65:1080
socks4://:@213.6.36.146:5678
socks5://sgvxptls:02a5uhzjxpxo@188.74.210.207:6286
socks4://:@91.150.77.57:56921
socks4://:@103.174.36.112:5678
socks4://:@152.70.244.240:16238
socks4://:@51.52.205.98:4153
socks4://:@51.222.241.8:23392
socks4://:@213.19.205.18:54321
socks4://:@103.140.35.11:4145
socks4://:@110.235.250.155:1080
socks4://:@111.68.127.170:4153
socks4://:@186.97.172.178:5678
socks4://:@190.120.252.113:4145
socks4://:@181.48.70.30:4153
socks4://:@74.62.23.242:39593
socks4://:@78.31.92.145:1080
socks4://:@103.220.205.162:4673
socks4://:@113.160.106.45:4153
socks4://:@121.139.218.165:43295
socks4://:@1.179.130.201:4153
socks4://:@103.167.170.86:1080
socks4://:@200.0.247.83:4153
socks4://:@94.232.145.158:5678
socks4://:@103.127.63.57:5678
socks4://:@117.102.72.114:4153
socks4://:@200.43.231.8:4153
socks4://:@190.5.234.34:5678
socks4://:@176.123.218.6:18080
socks4://:@195.177.217.131:26989
socks5://sgvxptls:02a5uhzjxpxo@185.199.231.45:8382
socks4://:@103.124.182.150:1080
socks4://:@189.201.191.18:4145
socks4://:@195.138.73.54:31145
socks4://:@79.125.195.102:5678
socks4://:@212.83.143.204:39724
socks4://:@202.183.155.242:4153
socks4://:@175.144.198.226:31694
socks4://:@117.198.221.34:4153
socks4://:@116.199.170.1:4145
socks4://:@95.158.141.62:25566
socks4://:@109.166.207.162:3629
socks4://:@196.0.111.194:48009
socks4://:@1.179.151.165:31948
socks4://:@93.185.74.214:9080
socks4://:@176.99.2.43:1080
socks4://:@91.194.239.122:5678
socks4://:@93.184.4.254:1080
socks4://:@92.241.87.14:5678
socks4://:@200.118.122.6:4153
socks4://:@171.103.104.46:1099
socks4://:@103.155.166.254:1080
socks4://:@167.99.151.120:30418
socks4://:@130.255.160.116:64398
socks4://:@89.208.199.134:14992
socks4://:@45.127.120.178:5678
socks4://:@138.0.60.19:1080
socks4://:@210.86.173.42:4153
socks4://:@175.139.179.65:41527
socks4://:@103.37.82.134:39873
socks4://:@113.190.253.76:5678
socks4://:@50.207.130.198:54321
socks4://:@195.162.71.27:3629
socks4://:@86.110.189.154:4145
socks4://:@27.72.122.228:51067
socks4://:@150.129.109.14:5678
socks4://:@51.83.116.2:49794
socks4://:@212.83.143.147:49137
socks4://:@209.240.50.56:39593
socks4://:@77.89.196.202:4153
socks4://:@31.163.204.200:4153
socks4://:@51.255.80.151:42304
socks4://:@47.243.88.120:5555
socks4://:@32.140.114.38:1080
socks4://:@1.20.137.82:32241
socks4://:@192.162.232.15:1080
socks4://:@203.154.232.25:4153
socks4://:@103.160.201.76:1080
socks4://:@104.236.0.129:35917
socks4://:@93.175.194.154:3629
socks4://:@202.150.157.70:4153
socks4://:@93.91.118.141:3629
socks4://:@201.158.120.44:45504
socks4://:@176.117.237.132:1080
socks4://:@159.203.65.130:32769
socks4://:@143.137.116.142:1080
socks4://:@164.138.43.14:1080
socks4://:@50.63.165.137:47818
socks4://:@118.67.220.225:4145
socks4://:@190.151.166.15:4153
socks4://:@36.67.88.77:4153
socks4://:@216.65.153.161:57369
socks4://:@208.113.198.147:5802
socks4://:@91.205.131.110:53339
socks4://:@162.255.108.225:5678
socks4://:@118.174.21.117:13629
socks4://:@212.41.1.163:1080
socks4://:@38.143.136.41:32767
socks4://:@138.122.164.201:57775
socks4://:@77.39.8.165:3629
socks4://:@174.138.56.41:48387
socks4://:@196.61.44.54:5678
socks4://:@150.129.57.251:4153
socks4://:@27.72.59.99:5678
socks4://:@216.65.148.87:64483
socks4://:@109.167.113.12:4153
socks4://:@5.8.240.90:4153
socks5://sgvxptls:02a5uhzjxpxo@188.74.210.21:6100
socks4://:@119.18.146.139:4153
socks4://:@113.176.195.145:4153
socks4://:@14.161.25.229:5678
socks4://:@124.105.55.176:30906
socks4://:@187.32.20.249:5678
socks4://:@177.86.24.130:4153
socks4://:@216.65.153.173:57369
socks4://:@103.183.60.226:32767
socks4://:@137.74.173.77:47906
socks4://:@185.43.249.148:39316
socks4://:@103.147.119.3:4145
socks4://:@103.127.38.46:1080
socks4://:@109.75.254.91:1080
socks4://:@190.144.224.182:44550
socks4://:@36.95.189.165:5678
socks4://:@103.114.96.125:8291
socks4://:@103.210.29.193:31433
socks5://sgvxptls:02a5uhzjxpxo@45.94.47.66:8110
socks4://:@14.161.14.106:5678
socks4://:@103.134.214.130:1648
socks4://:@91.193.125.123:3629
socks4://:@139.59.7.217:54554
socks4://:@47.243.197.34:61505
socks4://:@212.83.143.118:24144
socks4://:@162.202.70.105:1888
socks4://:@36.67.14.195:5678
socks4://:@175.100.47.191:5678
socks4://:@45.65.65.18:4145
socks4://:@183.88.247.52:4153
socks4://:@200.214.154.135:4145
socks4://:@190.89.89.157:4153
socks4://:@190.180.35.146:5678
socks4://:@112.221.46.117:4153
socks4://:@197.234.58.102:32767
socks4://:@192.99.244.173:15590
socks4://:@82.147.123.185:10808
socks4://:@109.69.161.131:10801
socks4://:@46.173.211.166:1080
socks4://:@196.250.39.65:5678
socks4://:@202.29.245.46:32241
socks4://:@186.219.96.47:49923
socks4://:@159.203.30.119:16884
socks4://:@103.122.85.117:35010
socks4://:@49.156.42.186:5678
socks4://:@200.212.2.94:61283
socks4://:@103.38.103.18:1080
socks4://:@216.65.148.98:64483
socks4://:@37.187.91.192:11721
socks4://:@103.15.245.18:4153
socks4://:@89.133.95.177:4145
socks4://:@36.92.138.51:5678
socks4://:@1.53.137.84:4145
socks4://:@89.58.45.94:45702
socks4://:@203.189.159.33:35010
socks5://sgvxptls:02a5uhzjxpxo@185.199.228.220:7300
socks4://:@46.188.2.42:5678
socks4://:@188.255.244.213:1080
socks4://:@178.218.201.63:1080
socks4://:@103.138.22.165:32000
socks4://:@202.138.242.6:38373
socks4://:@45.7.200.149:4145
socks4://:@170.81.108.45:4153
socks4://:@41.180.96.110:4153
socks4://:@119.10.177.107:1080
socks5://sgvxptls:02a5uhzjxpxo@188.74.183.10:8279
socks4://:@190.123.226.109:5678
socks4://:@202.149.89.70:7999
socks4://:@37.238.134.130:31772
socks4://:@5.56.124.176:8192
socks4://:@119.110.75.78:35010
socks4://:@190.4.205.226:4153
socks4://:@94.180.217.100:4145
socks4://:@180.180.171.113:4145
socks4://:@31.43.33.56:4153
socks4://:@31.170.17.141:4153
socks4://:@62.122.201.246:50129
socks4://:@189.195.176.99:5678
socks4://:@103.235.199.100:25566
socks4://:@67.159.245.157:4153
socks4://:@162.0.220.220:59864
socks4://:@45.236.185.1:4153
socks4://:@138.36.150.16:1080
socks4://:@12.89.124.138:4145
socks4://:@185.184.197.108:5678
socks4://:@134.209.29.208:1080
socks4://:@94.181.178.152:1080
socks4://:@117.4.242.216:5678
socks4://:@36.67.40.2:4153
socks4://:@203.130.18.122:45919
socks4://:@185.95.199.103:1099
socks4://:@103.164.190.221:5430
socks4://:@83.168.84.134:4153
socks4://:@210.245.51.230:9898
socks4://:@103.166.32.130:11080
socks4://:@54.39.87.232:39721
socks4://:@200.24.148.132:3629
socks4://:@27.72.73.143:4153
socks4://:@103.105.103.1:3629
socks4://:@170.254.255.232:45816
socks4://:@220.132.181.64:21
socks4://:@197.232.47.122:5678
socks4://:@109.94.178.238:3629
socks4://:@80.80.152.123:5678
socks4://:@78.9.110.94:1080
socks4://:@202.46.145.4:8080
socks4://:@46.10.208.106:8192
socks4://:@208.175.137.169:5678
socks4://:@182.52.67.122:50801
socks4://:@37.255.183.2:3629
socks4://:@202.144.134.150:5678
socks4://:@193.200.151.158:8192
socks4://:@45.33.21.96:35500
socks5://sgvxptls:02a5uhzjxpxo@45.155.68.129:8133
socks4://:@103.148.225.6:5678
socks4://:@170.78.92.6:5678
socks5://sgvxptls:02a5uhzjxpxo@154.95.36.199:6893
socks4://:@109.238.223.1:51372
socks4://:@1.221.173.148:4145
socks4://:@212.39.114.139:5678
socks4://:@77.104.75.97:5678
socks4://:@118.67.216.94:4145
socks4://:@117.220.229.148:5678
socks4://:@221.121.12.238:47012
socks4://:@93.117.72.27:55770
socks4://:@31.220.43.141:13396
socks4://:@62.133.135.129:4153
socks4://:@103.106.216.82:4145
socks4://:@103.254.167.130:1080
socks4://:@80.63.107.90:4153
socks4://:@201.46.29.115:5678
socks4://:@117.74.125.210:1133
socks4://:@113.53.247.221:4153
socks4://:@110.232.86.22:5678
socks4://:@72.250.17.184:39593
socks4://:@179.108.158.204:4145
socks4://:@118.137.56.108:1080
socks4://:@91.185.236.24:4145
socks4://:@5.39.68.33:60560
socks4://:@125.227.225.157:3389
socks4://:@200.27.110.28:57702
socks4://:@195.22.221.210:4145
socks4://:@41.139.250.223:5678
socks4://:@213.6.68.210:4145
socks4://:@105.208.44.53:5678
socks4://:@216.65.153.169:57369
socks4://:@116.118.98.9:5678
socks4://:@88.255.40.194:65484
socks4://:@103.110.10.154:4153
socks4://:@202.29.214.22:4153
socks4://:@74.56.228.180:4145
socks4://:@86.111.144.10:4145
socks4://:@186.1.206.154:1080
socks4://:@89.22.17.62:43110
socks4://:@202.43.112.37:4145
socks4://:@200.80.227.234:4145
socks4://:@186.145.192.251:5678
socks4://:@85.237.62.189:3629
socks4://:@188.175.223.113:5678
socks4://:@185.103.14.155:4153
socks4://:@103.103.143.63:5678
socks4://:@167.71.220.29:7497
socks4://:@36.88.62.175:7511
socks4://:@105.214.24.241:5678
socks4://:@190.181.140.90:5678
socks4://:@188.170.114.218:3629
socks4://:@103.247.22.52:12
socks4://:@85.100.40.12:5678
socks4://:@62.60.162.30:3128
socks4://:@94.73.251.19:1080
socks4://:@196.0.111.186:46048
socks4://:@154.79.242.178:10801
socks4://:@216.65.153.175:57369
socks4://:@45.249.79.190:3629
socks4://:@212.156.217.147:4153
socks4://:@113.53.29.228:13629
socks4://:@92.207.253.226:4145
socks4://:@103.111.22.26:58563
socks4://:@194.8.145.174:5678
socks4://:@110.78.82.233:5678
socks4://:@188.237.60.27:1080
socks4://:@154.79.246.18:9898
socks4://:@2.57.131.19:4145
socks4://:@216.65.148.88:64483
socks4://:@103.59.190.2:56252
socks4://:@110.78.82.70:5678
socks4://:@200.115.157.211:4145
socks4://:@178.215.163.218:4145
socks4://:@200.85.139.58:4153
socks4://:@205.177.85.134:39593
socks4://:@103.167.171.13:1026
socks4://:@179.96.251.161:5678
socks4://:@164.52.42.6:4145
socks4://:@202.151.163.10:1080
socks4://:@77.225.198.220:4673
socks4://:@203.17.65.47:4153
socks4://:@190.57.131.158:1080
socks4://:@103.124.197.78:1080
socks4://:@50.235.92.65:32100
socks4://:@188.191.164.55:4890
socks4://:@95.86.206.6:8080
socks4://:@91.150.77.58:56921
socks4://:@103.213.118.46:1080
socks4://:@103.110.59.3:35294
socks4://:@118.173.230.19:1080
socks4://:@202.137.141.26:5678
socks4://:@70.32.26.101:37612
socks4://:@194.124.37.101:10820
socks4://:@45.70.153.16:4153
socks4://:@185.131.240.104:4153
socks4://:@103.139.246.166:5678
socks4://:@128.127.94.160:5678
socks4://:@167.172.130.72:1080
socks4://:@183.80.130.10:4145
socks4://:@170.247.43.142:32812
socks4://:@103.158.130.214:12391
socks4://:@31.10.63.224:32000
socks4://:@85.215.229.3:125
socks4://:@41.169.145.194:1080
socks4://:@46.160.90.81:5678
socks4://:@46.225.251.206:1080
socks4://:@103.112.128.37:9091
socks4://:@194.37.97.189:51080
socks4://:@180.92.212.200:5678
socks4://:@103.36.11.158:4145
socks4://:@103.150.115.186:4153
socks4://:@45.234.67.62:5678
socks4://:@103.150.110.202:9969
socks4://:@202.40.181.220:31247
socks4://:@202.180.8.145:1080
socks4://:@87.110.237.97:57928
socks4://:@185.51.92.103:51327
socks4://:@202.138.249.15:3629
socks4://:@109.87.172.133:5678
socks4://:@185.126.44.135:5678
socks4://:@38.113.171.88:57775
socks4://:@85.217.192.39:4145
socks4://:@191.7.208.100:31576
socks4://:@36.88.247.90:5678
socks4://:@203.142.74.250:5678
socks4://:@203.128.83.170:4153
socks4://:@185.89.65.170:33744
socks4://:@51.83.190.248:19050
socks4://:@46.40.60.108:52088
socks4://:@201.184.239.75:5678
socks4://:@103.160.12.138:4145
socks4://:@46.0.203.140:4890
socks4://:@103.148.92.1:5678
socks4://:@165.227.153.96:59166
socks4://:@36.91.45.11:51299
socks4://:@93.90.212.2:4153
socks4://:@202.29.220.202:61507
socks4://:@103.176.96.195:1080
socks4://:@213.32.252.134:5678
socks4://:@102.244.120.10:45413
socks4://:@186.66.97.94:5678
socks4://:@179.107.51.69:4153
socks4://:@139.99.233.80:1080
socks4://:@86.110.189.118:42539
socks4://:@212.126.5.245:42344
socks4://:@184.149.25.55:5678
socks4://:@101.96.117.169:1080
socks4://:@111.90.169.178:5678
socks4://:@155.254.9.2:36510
socks4://:@189.201.187.3:4145
socks4://:@45.127.134.139:1080
socks4://:@46.173.35.229:3629
socks4://:@188.134.1.49:3629
socks4://:@185.136.151.138:5678
socks4://:@45.185.236.254:1080
socks4://:@176.197.234.202:4153
socks4://:@82.103.70.227:4145
socks4://:@95.128.142.76:1080
socks4://:@87.121.18.154:5678
socks4://:@95.31.5.29:51528
socks4://:@222.165.223.138:41541
socks4://:@167.99.3.169:20262
socks4://:@122.252.179.66:5678
"""

#### -

In [204]:
# One proxy URL per line of the literal; [1:-1] drops the empty strings produced
# by the literal's leading and trailing newlines.
# NOTE(review): the same two statements are repeated in the queue-setup cell below.
proxies_list = proxies_str.split('\n')[1:-1]
nb_proxies = len(proxies_list)

Code (désactivé) utilisé pour récupérer des proxies SOCKS sur socks-proxy.net

In [205]:
#headers = {
#    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.35',
#}
#response = requests.get("https://www.socks-proxy.net/", headers=headers)
#soup = BeautifulSoup(response.text, "html.parser")
#for elem in table[0].find_all("tr")[1:]:
#    cells = elem.find_all("td")
#    ip = cells[0].text
#    port = cells[1].text
#    if f"socks4://:@{ip}:{port}" not in set(proxies_list):
#        print(f"socks4://:@{ip}:{port}")
#        proxies_list.append(f"socks4://:@{ip}:{port}")

Nous allons utiliser une file (`Queue`, FIFO) afin de ne surcharger aucun des proxies. De plus, afin de contourner les protections contre le scraping, nous allons utiliser un délai aléatoire entre chacune de nos requêtes.

In [206]:
# Shared state used by the scraping threads.
proxies_list = proxies_str.split('\n')[1:-1]
nb_proxies = len(proxies_list)
# FIFO of currently available proxies: a thread takes one, uses it, and puts it
# back after a successful request.
proxies_queue = Queue(nb_proxies)
# Proxies for which a scrape returned nothing; they are never re-queued.
proxies_dead = []
for proxy in proxies_list:
    proxies_queue.put(proxy)
# At most one running thread per proxy.
max_threads = nb_proxies
semaphore = threading.Semaphore(max_threads)
# Set to True by a thread once every proxy is dead, so waiting threads stop immediately.
kill = False

In [207]:
# Serializes writes to the shared DataFrame: scrape_thread runs in many threads
# at once and pandas objects are not safe for concurrent mutation.
_df_paths_series_lock = threading.Lock()


def scrape_thread(title, index):
    """Thread worker: resolve one series title to a tconst through a pooled proxy.

    Takes a proxy from ``proxies_queue``, scrapes IMDb for ``title`` and, on
    success, stores the result in ``df_paths_series`` at row ``index`` (with
    ``nb_possible_tconst`` set to 1). It then sleeps 1.3-2.5 s before returning
    the proxy to the queue, which throttles the request rate per proxy. On
    failure the proxy is added to ``proxies_dead`` (and not re-queued) and
    another proxy is tried.

    When every proxy has been declared dead, the global ``kill`` flag is set so
    threads still waiting on the semaphore exit immediately.

    NOTE(review): ``scrap_most_relevent_tconst`` returns None both for a broken
    proxy and for a title with no usable result, so an unfindable title marks
    each proxy it tries as dead — confirm whether that is acceptable.

    Args:
        title: cleaned series title to search for.
        index: row label in ``df_paths_series`` where the result is written.
    """
    global kill
    # Local import: the notebook's import cell only brings in Queue.
    from queue import Empty

    retry = True
    is_started = False
    with semaphore:
        if kill:
            print(f"{Colors.WARNING}Killed")
            return
        while retry and len(proxies_dead) != nb_proxies:
            try:
                proxy = proxies_queue.get(timeout=10)
            except Empty:
                # No proxy became available in time; give up on this title.
                # Only the expected timeout is caught, so real errors are no
                # longer silently swallowed.
                return
            print(f"{Colors.INFO}Working with {proxy}")
            is_started = True
            tconst = scrap_most_relevent_tconst(title, proxy)
            if tconst is None:
                print(f"{Colors.ERROR}Proxy {proxy} seems down")
                proxies_dead.append(proxy)
            else:
                with _df_paths_series_lock:
                    df_paths_series.loc[index, "possible_tconst"] = tconst
                    df_paths_series.loc[index, "nb_possible_tconst"] = 1
                # Random pause before releasing the proxy (outside the lock).
                delay = random.randint(1300, 2500)
                time.sleep(delay / 1000)
                proxies_queue.put(proxy)
                print(f"{Colors.INFO2}Freed with {proxy}")
                retry = False

        if len(proxies_dead) == nb_proxies and is_started:
            print(f"{Colors.FAIL}Stopped at iter {index}")
            kill = True
            

In [208]:
# Start one worker thread per series to resolve; the semaphore inside
# scrape_thread limits how many of them run at the same time.
threads = []

for i in range(len(df_paths_series_to_scrap)):
    title = df_paths_series_to_scrap.loc[i, "serie_clean_name"]
    # Original row label in df_paths_series (column created by reset_index()).
    index = df_paths_series_to_scrap.loc[i, "index"]
    
    thread = threading.Thread(target=scrape_thread, args=(title, index))
    threads.append(thread)
    thread.start()

# Wait for every worker to finish before reading the results.
for thread in threads:
    thread.join()

[0m[94mWorking with socks4://:@51.83.116.10:31625
[0m[94mWorking with socks4://:@181.204.214.178:5968
[0m[94mWorking with socks4://:@190.138.250.48:3629
[0m[94mWorking with socks4://:@94.198.211.217:5678
[0m[94mWorking with socks4://:@216.65.153.164:57369
[0m[94mWorking with socks4://:@72.249.209.140:5678
[0m[94mWorking with socks4://:@181.113.135.254:50083
[0m[94mWorking with socks4://:@152.231.91.206:35010
[0m[94mWorking with socks4://:@45.128.133.209:1080
[0m[94mWorking with socks4://:@49.156.38.126:5678
[0m[94mWorking with socks4://:@190.182.88.214:30956
[0m[94mWorking with socks4://:@213.226.11.149:59086
[0m[94mWorking with socks4://:@203.153.125.13:65424
[0m[94mWorking with socks4://:@213.91.128.99:10801
[0m[94mWorking with socks4://:@45.228.147.204:4153
[41m[37mProxy socks4://:@49.156.38.126:5678 seems down
[0m[94mWorking with socks4://:@103.114.96.93:1080
[41m[37mProxy socks4://:@181.204.214.178:5968 seems down
[0m[94mWorking with socks4://

In [213]:
# Display the series table after scraping.
df_paths_series

Unnamed: 0,serie,path_lemmatized,serie_clean_name,possible_tconst,nb_possible_tconst
0,Lost,data/lemmatized_series/1___Lost.pickle,Lost,tt0411008,1.0
1,Heroes,data/lemmatized_series/2___Heroes.pickle,Heroes,tt0813715,1.0
2,Jericho_(2006),data/lemmatized_series/3___Jericho_(2006).pickle,Jericho,tt0805663,1.0
3,Prison_Break,data/lemmatized_series/4___Prison_Break.pickle,Prison Break,tt0455275,1.0
4,Supernatural,data/lemmatized_series/5___Supernatural.pickle,Supernatural,tt0460681,1.0
...,...,...,...,...,...
3522,JoJo_s_Bizarre_Adventure_(2012),data/lemmatized_series/5472___JoJo_s_Bizarre_A...,JoJo's Bizarre Adventure,tt2359704,1.0
3523,Auschwitz__The_Nazis_and_the_Final_Solution,data/lemmatized_series/5477___Auschwitz__The_N...,Auschwitz The Nazis and the Final Solution,tt0446610,1.0
3524,Modus,data/lemmatized_series/5478___Modus.pickle,Modus,tt4600404,1.0
3525,,data/lemmatized_series/5479________.pickle,,,0.0


In [227]:
# Keep only the series resolved to exactly one IMDb identifier, drop the helper
# columns, and expose the identifier under the name `tconst`.
df_series_usable = (
    df_paths_series[df_paths_series["nb_possible_tconst"] == 1]
    .drop(["serie_clean_name", "nb_possible_tconst"], axis=1)
    .rename(columns={"possible_tconst": "tconst"})
)
df_series_usable

Unnamed: 0,serie,path_lemmatized,tconst
0,Lost,data/lemmatized_series/1___Lost.pickle,tt0411008
1,Heroes,data/lemmatized_series/2___Heroes.pickle,tt0813715
2,Jericho_(2006),data/lemmatized_series/3___Jericho_(2006).pickle,tt0805663
3,Prison_Break,data/lemmatized_series/4___Prison_Break.pickle,tt0455275
4,Supernatural,data/lemmatized_series/5___Supernatural.pickle,tt0460681
...,...,...,...
3521,Con_Man,data/lemmatized_series/5471___Con_Man.pickle,tt4642170
3522,JoJo_s_Bizarre_Adventure_(2012),data/lemmatized_series/5472___JoJo_s_Bizarre_A...,tt2359704
3523,Auschwitz__The_Nazis_and_the_Final_Solution,data/lemmatized_series/5477___Auschwitz__The_N...,tt0446610
3524,Modus,data/lemmatized_series/5478___Modus.pickle,tt4600404


Il ne reste que 15 séries pour lesquelles nous n'avons pas réussi à trouver l'identifiant correspondant dans la base de données IMDb

On enregistre ce Dataframe

In [230]:
# Persist the series -> tconst mapping for later notebooks.
# NOTE(review): presumably save_array_to_path pickles the object to the given path — confirm in modules/import_data.
import_data.save_array_to_path(df_series_usable, "data/imdb/scrap/mapping_transcripts_imdb_tconst.pickle")