In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif
from itertools import combinations
import fs_pp

In [None]:
pd.set_option('display.max_rows', None)

# Importando dados

In [2]:
dados = pd.read_csv('https://drive.google.com/uc?export=download&id=1HzSQN0r64a6a3vkPcpPyeKjk-BDFWyPc', error_bad_lines=False)
dados.sample(3)

b'Skipping line 1974: expected 13 fields, saw 14\nSkipping line 4350: expected 13 fields, saw 14\nSkipping line 4631: expected 13 fields, saw 24\nSkipping line 5710: expected 13 fields, saw 24\nSkipping line 6525: expected 13 fields, saw 24\nSkipping line 6919: expected 13 fields, saw 16\nSkipping line 6930: expected 13 fields, saw 14\nSkipping line 7349: expected 13 fields, saw 16\nSkipping line 7367: expected 13 fields, saw 15\nSkipping line 8897: expected 13 fields, saw 24\nSkipping line 9876: expected 13 fields, saw 24\nSkipping line 9941: expected 13 fields, saw 17\nSkipping line 10530: expected 13 fields, saw 16\nSkipping line 10531: expected 13 fields, saw 17\nSkipping line 10532: expected 13 fields, saw 18\nSkipping line 11839: expected 13 fields, saw 18\nSkipping line 12328: expected 13 fields, saw 14\nSkipping line 12540: expected 13 fields, saw 23\nSkipping line 13098: expected 13 fields, saw 36\nSkipping line 13758: expected 13 fields, saw 16\nSkipping line 13800: expected 

Unnamed: 0,H,title,@1922,D:Hitchcock,prds,st,prc,prc.1,cat,aw,locale,"Nt(first, all)",|
3206,HHa3,T:Man of the Forest,1933,D:Hathaway,PN:,SU:,bnw,West,,West,|,,
1590,TWh30,T:St.~Martin's Lane,1938,D:Whelan,PN:,SU:,prc,Ctxx,aw,lc,,|,
844,FLl7,T:Les Mis\'erables,1918,D:F.Lloyd,PN:,"SL:USA,England",,Dram,aw,lc,Nt(\#3),|,


# Filtrando colunas de possível relevancia para modelo

In [3]:
filter_columns = dados[['D:Hitchcock','prds','prc','st']]
filter_columns.sample(3)

Unnamed: 0,D:Hitchcock,prds,prc,st
7784,D:R.Lester,"PN:Alex Winitsky, Arlene Sellers",\Tcol,St:U.A.
976,D:Niblo,PN:,,SU:
11349,D:Lyne,PN:,prc,SU:


# Renomeando colunas

In [4]:
filter_columns.columns=['diretor','produtor','genero','estudio']
filter_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14402 entries, 0 to 14401
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   diretor   14395 non-null  object
 1   produtor  14385 non-null  object
 2   genero    13725 non-null  object
 3   estudio   14387 non-null  object
dtypes: object(4)
memory usage: 450.2+ KB


# Padronizando Tipos de dados para fetures/target

In [5]:
columns= ['diretor','produtor','genero','estudio']
format_type = fs_pp.to_type(filter_columns,columns, 'category')
format_type.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14402 entries, 0 to 14401
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   diretor   14395 non-null  category
 1   produtor  14385 non-null  category
 2   genero    13725 non-null  category
 3   estudio   14387 non-null  category
dtypes: category(4)
memory usage: 339.4 KB


# Removendo inconsistência de dados categóricos ou atribuindo NaN a um dado irrelevante
*   Eliminação de caracteres e marcadores indesejados (D:, F:, etc)
*   Eliminação de multiplas categorias em um mesmo exemplo (considerar o primeiro)



In [6]:
clean_elements = fs_pp.remove_incoherence(format_type, r'D:|F:|P:|PN:|SU:|St:|SL:|S:|PU:', r'')
clean_elements = fs_pp.remove_incoherence(clean_elements, r'(.*)\,.+', r'\1')
clean_elements = fs_pp.remove_incoherence(clean_elements, r'(.*);.*', r'\1')
clean_elements = fs_pp.remove_incoherence(clean_elements, r'.*\\(.*)', r'\1')
clean_elements = fs_pp.remove_incoherence(clean_elements, r'(.*)\\.*', r'\1')
clean_elements = fs_pp.remove_incoherence(clean_elements, r'(.*) and .*', r'\1')
clean_elements = fs_pp.remove_incoherence(clean_elements, r'^st\.$|^st$|^prds$|^$|^\s+$', np.nan)
clean_elements.sample(10)

  return func(self, *args, **kwargs)


Unnamed: 0,diretor,produtor,genero,estudio
8983,Waters,,col,
5017,D.Siegel,D.Siegel,Pathecolor,Universal
7245,N.Lloyd,Hitchcock,bnw,Shamley
996,K.Vidor,,prc,
3980,A.Malraux,A.Malraux,prc,
10635,G.Murphy,,prc,NZ
6992,Mulligan,,prc,
9918,Bass,Paul B. Radin,Tcol,Paramount
9482,Annaud,,col,
1101,Curtiz,Robert Fellows,bnw,Warners


# Drop nulls

In [7]:
clean_elements.dropna(inplace=True)
clean_elements.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4065 entries, 0 to 14261
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   diretor   4065 non-null   object
 1   produtor  4065 non-null   object
 2   genero    4065 non-null   object
 3   estudio   4065 non-null   object
dtypes: object(4)
memory usage: 158.8+ KB


# Eliminando registros para labels (target) com frequencia <= a k% do label com maior frequência (Quando não é possui obter mais dados para balancear classes)

In [9]:
balance_target = fs_pp.remove_low_freq(clean_elements, target_name='estudio', threshold=0.25)
balance_target['estudio'].value_counts()

Shamley      353
Paramount    341
MGM          341
Fox          338
Warners      331
Universal    246
Columbia     192
U.A.         182
RKO          169
Name: estudio, dtype: int64

In [11]:
balance_diretor = fs_pp.remove_low_freq(balance_target, target_name='diretor', threshold=0.25)
balance_diretor['diretor'].value_counts()

R.Stevens        48
Hitchcock        43
Henreid          32
Daugherty        28
Cukor            28
N.Lloyd          23
Curtiz           23
Hathaway         20
J.Ford           19
Crosland~jr      19
M.LeRoy          18
Brahm            15
W.Lang           15
Wyler            13
R.Thorpe         13
J.Huston         13
Taurog           13
Neilson          13
H.King           13
S.Pollack        13
Koster           13
R.Fleischer      13
J.Newman         12
Kjellin          12
Minnelli         12
B.Wilder         12
G.Stevens~sr.    12
Dieterle         12
Name: diretor, dtype: int64

In [12]:
balance_produtor = fs_pp.remove_low_freq(balance_diretor, target_name='produtor', threshold=0.25)
balance_produtor['produtor'].value_counts()

Hitchcock    232
Name: produtor, dtype: int64

# Resetando Index

In [10]:
balance_produtor.reset_index(drop=True)

Unnamed: 0,diretor,produtor,genero,estudio
0,Se.Hicks,Lasky,sbw,Famous
1,Hitchcock,Hitchcock,sbw,Islington
2,Hitchcock,Balcon,sbw,B-S-F
3,Hitchcock,Balcon,sbw,Gainsborough
4,Hitchcock,Balcon,sbw,UFA
...,...,...,...,...
4060,Blanks,Neal Moritz,prc,Phoenix
4061,Katzenberg,Katzenberg,col,Dreamworks
4062,M.S.Johnson,Laurence Usher,col,Hollywood
4063,Dragojevic,Dragan Bjelogric,prc,SLeisure Time


# Feature selector

In [18]:
features = ['diretor','produtor','genero']
target = ['estudio']
fs_scores= fs_pp.feature_selection(balance_produtor,features,target,'cat_cat')

for i in range(len(fs_scores)):
    print('Feature %s: %f' % (features[i], fs_scores[i]))

  return f(**kwargs)


Feature diretor: 0.345194
Feature produtor: 0.009217
Feature genero: 0.104929


# Visualizando correlação feature_n x feature_m ~ target

In [42]:
def subplot_strip(Dataset,features,target):
  perm_features = list(combinations(features, 2))
  fig, axes = plt.subplots(len(perm_features),1,figsize=(10,len(perm_features)*10))
  
  for i, perm in enumerate(perm_features):
        sns.stripplot(ax=axes[i],data=Dataset,x=perm[0],y=perm[1], hue=target)
  plt.show()

In [43]:
fs_pp.subplot_strip(balance_produtor,features,'estudio')

In [None]:
Dados provavelmente não irão contribuir para convergencia do modelo!