In [3]:
from keras.applications.imagenet_utils import decode_predictions
from keras.applications import inception_v3
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from matplotlib import pyplot as plt
from matplotlib.pyplot import imshow
import numpy as np
import math
import glob
from PIL import Image


Using TensorFlow backend.


In [17]:
# `images` collects, per file, the decoded predictions returned by
# decode_predictions (class id, class name, score) -- not the pixel data.
images = []
imodel = inception_v3.InceptionV3(weights='imagenet')
# glob.glob() yields paths in arbitrary, filesystem-dependent order; sorting
# keeps each image at the same position in `images` from one run to the next,
# so row indices used further down stay meaningful.
for file in sorted(glob.glob('img/*.jpg')):
    img = load_img(file, target_size=(299, 299))
    numpy_image = img_to_array(img)
    # predict() is fed a batch of one image: add a leading axis of size 1.
    image_batch = np.expand_dims(numpy_image, axis=0)
    # Preprocess a copy so `image_batch` itself is left untouched.
    processed_image = inception_v3.preprocess_input(image_batch.copy())
    predictions = imodel.predict(processed_image)
    label = decode_predictions(predictions)
    images.append(label)

In [6]:
# Dump the decoded predictions collected for every image.
print(images)

[[[('n04228054', 'ski', 0.07556888), ('n03425413', 'gas_pump', 0.06668205), ('n03662601', 'lifeboat', 0.042928692), ('n03384352', 'forklift', 0.040784005), ('n02966687', "carpenter's_kit", 0.0322768)]], [[('n03425413', 'gas_pump', 0.4760827), ('n03032252', 'cinema', 0.18571566), ('n02977058', 'cash_machine', 0.03933277), ('n06874185', 'traffic_light', 0.03550076), ('n03982430', 'pool_table', 0.030363312)]], [[('n03459775', 'grille', 0.51086026), ('n04285008', 'sports_car', 0.09403146), ('n03271574', 'electric_fan', 0.021558167), ('n03127747', 'crash_helmet', 0.01486559), ('n03100240', 'convertible', 0.014169187)]], [[('n03602883', 'joystick', 0.97011065), ('n01629819', 'European_fire_salamander', 0.0010214719), ('n04286575', 'spotlight', 0.0007251861), ('n03691459', 'loudspeaker', 0.0005546969), ('n07747607', 'orange', 0.00035507817)]], [[('n04192698', 'shield', 0.102857195), ('n06794110', 'street_sign', 0.10229924), ('n02910353', 'buckle', 0.10128736), ('n03666591', 'lighter', 0.02638

In [12]:
import re
def clean_word(s):
    """Normalise a raw label into lower-case text without punctuation.

    Steps, applied in order:
      1. lower-case the input;
      2. replace every ``<...>`` tag-like span with a single space;
      3. delete the characters ``? | ! ' " #``;
      4. replace each of ``. , _ - ) ( /`` with a single space.

    Runs of spaces are not collapsed and the result is not stripped.

    :param s: the string to clean.
    :return: the cleaned string.
    """
    lowered = s.lower()
    # Non-greedy match so that each tag is replaced on its own.
    tag_pattern = re.compile('<.*?>')
    untagged = tag_pattern.sub(' ', lowered)
    # Characters dropped outright (no space left behind).
    stripped = re.sub(r'[?|!|\'|"|#]', r'', untagged)
    # Separator-like characters become a space so words stay apart.
    return re.sub(r'[.|,|_|\-|)|(|\|/]', r' ', stripped)

In [13]:
# Build one "document" per image: the list of cleaned class names taken from
# its decoded predictions (element 1 of each prediction tuple).
documents = []
for img in images:
    tags = [clean_word(pred[1]) for batch in img for pred in batch]
    documents.append(tags)
print(documents)

[['ski', 'gas pump', 'lifeboat', 'forklift', 'carpenters kit'], ['gas pump', 'cinema', 'cash machine', 'traffic light', 'pool table'], ['grille', 'sports car', 'electric fan', 'crash helmet', 'convertible'], ['joystick', 'european fire salamander', 'spotlight', 'loudspeaker', 'orange'], ['shield', 'street sign', 'buckle', 'lighter', 'web site'], ['grille', 'sports car', 'convertible', 'car wheel', 'cornet'], ['sports car', 'car wheel', 'convertible', 'grille', 'racer'], ['pickup', 'minivan', 'car wheel', 'grille', 'beach wagon'], ['racer', 'go kart', 'lawn mower', 'harvester', 'car wheel'], ['half track', 'buckle', 'cannon', 'purse', 'mailbag'], ['pier', 'submarine', 'suspension bridge', 'aircraft carrier', 'bullet train'], ['gas pump', 'golfcart', 'vacuum', 'power drill', 'minivan'], ['car wheel', 'coil', 'barrow', 'reflex camera', 'racer'], ['screw', 'whistle', 'corkscrew', 'nail', 'dumbbell'], ['seat belt', 'minivan', 'convertible', 'minibus', 'limousine'], ['car mirror', 'loupe', '

In [14]:
def create_dict_freq(documents):
    """Count how many times each word occurs across all documents.

    :param documents: iterable of documents, each an iterable of hashable
        words.
    :return: a plain dict mapping word -> total number of occurrences, with
        keys in first-seen order.
    """
    counts = {}
    for tags in documents:
        for tag in tags:
            # Unseen words start from zero.
            counts[tag] = counts.get(tag, 0) + 1
    return counts

In [15]:
# Occurrence count of every tag across all documents; read later by
# top_tfidf_feats to fill its 'freq' column.
freq_dict = create_dict_freq(documents)

In [19]:
# Dump the tag -> occurrence-count mapping.
print(freq_dict)

{'ski': 1, 'gas pump': 3, 'lifeboat': 1, 'forklift': 1, 'carpenters kit': 1, 'cinema': 1, 'cash machine': 1, 'traffic light': 1, 'pool table': 1, 'grille': 5, 'sports car': 3, 'electric fan': 1, 'crash helmet': 1, 'convertible': 4, 'joystick': 1, 'european fire salamander': 1, 'spotlight': 1, 'loudspeaker': 1, 'orange': 1, 'shield': 1, 'street sign': 2, 'buckle': 3, 'lighter': 1, 'web site': 1, 'car wheel': 5, 'cornet': 1, 'racer': 3, 'pickup': 1, 'minivan': 3, 'beach wagon': 1, 'go kart': 1, 'lawn mower': 1, 'harvester': 1, 'half track': 1, 'cannon': 1, 'purse': 1, 'mailbag': 1, 'pier': 1, 'submarine': 1, 'suspension bridge': 1, 'aircraft carrier': 1, 'bullet train': 1, 'golfcart': 1, 'vacuum': 1, 'power drill': 1, 'coil': 1, 'barrow': 1, 'reflex camera': 1, 'screw': 1, 'whistle': 1, 'corkscrew': 1, 'nail': 1, 'dumbbell': 1, 'seat belt': 1, 'minibus': 1, 'limousine': 1, 'car mirror': 1, 'loupe': 1, 'baboon': 1, 'chainlink fence': 1, 'shopping basket': 1, 'odometer': 3, 'stopwatch': 2,

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Each document is already a list of tags, so the tokenizer simply hands the
# document back; lower-casing is switched off because clean_word already
# lower-cased every tag.
vect = TfidfVectorizer(
    lowercase=False,
    tokenizer=lambda doc: doc,
)
tfidf_matrix = vect.fit_transform(documents)
# Dense view of the matrix: one row per document, one column per tag.
df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vect.get_feature_names(),
)
print(df)


    aircraft carrier       alp  analog clock    baboon    barrow  beach wagon  \
0           0.000000  0.000000      0.000000  0.000000  0.000000     0.000000   
1           0.000000  0.000000      0.000000  0.000000  0.000000     0.000000   
2           0.000000  0.000000      0.000000  0.000000  0.000000     0.000000   
3           0.000000  0.000000      0.000000  0.000000  0.000000     0.000000   
4           0.000000  0.000000      0.000000  0.000000  0.000000     0.000000   
5           0.000000  0.000000      0.000000  0.000000  0.000000     0.000000   
6           0.000000  0.000000      0.000000  0.000000  0.000000     0.000000   
7           0.000000  0.000000      0.000000  0.000000  0.000000     0.530789   
8           0.000000  0.000000      0.000000  0.000000  0.000000     0.000000   
9           0.000000  0.000000      0.000000  0.000000  0.000000     0.000000   
10          0.447214  0.000000      0.000000  0.000000  0.000000     0.000000   
11          0.000000  0.0000

In [23]:
# Combine the two analyses: TF-IDF weights and raw tag frequencies.
def top_tfidf_feats(row, features, top_n=5, freqs=None):
    """Return the ``top_n`` highest-weighted features of one TF-IDF row.

    Parameters
    ----------
    row : array-like
        Weights for a single document; ``row[i]`` is the weight of
        ``features[i]``.
    features : sequence
        Feature names, indexable by integer position.
    top_n : int, optional
        Maximum number of features to return (default 5).
    freqs : mapping, optional
        Maps a feature name to its occurrence count. When omitted, the
        module-level ``freq_dict`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``'feature'``, ``'tfidf'``, ``'freq'``, ordered by decreasing
        weight. If nothing is selected (e.g. ``top_n=0``) an empty frame with
        those three columns is returned instead of raising.

    Raises
    ------
    KeyError
        If a selected feature is missing from the frequency mapping.
    """
    if freqs is None:
        # Fall back to the notebook-level frequency table.
        freqs = freq_dict
    # argsort is ascending: reverse it, then keep the first top_n positions.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i], freqs[f'{features[i]}']) for i in topn_ids]
    # Passing the column names to the constructor (rather than assigning them
    # afterwards) also works when top_feats is empty.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf', 'freq'])

In [24]:
def top_feats_in_doc(Xtr, features, row_id, top_n=5):
    """Return the top TF-IDF features of a single document.

    :param Xtr: matrix whose ``Xtr[row_id]`` exposes ``.toarray()``
        (presumably the sparse TF-IDF matrix -- confirm against the caller).
    :param features: feature names aligned with the columns of ``Xtr``.
    :param row_id: index of the document row to inspect.
    :param top_n: maximum number of features to report (default 5).
    :return: whatever ``top_tfidf_feats`` returns for that row.
    """
    dense_row = Xtr[row_id].toarray()
    # Drop the length-1 axes so the row can be indexed by feature position.
    flat_row = np.squeeze(dense_row)
    return top_tfidf_feats(flat_row, features, top_n=top_n)

In [26]:
# Show the top TF-IDF tags of the document at row index 5 of tfidf_matrix.
print(top_feats_in_doc(tfidf_matrix, vect.get_feature_names(),5))

       feature     tfidf  freq
0       cornet  0.569544     1
1   sports car  0.453361     3
2  convertible  0.415958     4
3       grille  0.385398     5
4    car wheel  0.385398     5
