# Создание фичи с определением пола

In [3]:
import pandas as pd
import re

In [6]:
df_users = pd.read_parquet('user.parquet')
df_genders = df_users[['last_name', 'first_name']]
df_genders

Unnamed: 0,last_name,first_name
0,Анурьева,Арина
1,Xxx,Xxx
2,Луч,Красный
3,Laemsan,Wanny
4,Movies,Channel-Myanmar
...,...,...
145,Вишняков,Костя
146,Москвичёва,Саша
147,Магомедов,Омар
148,Савчук,Константин


In [7]:
df_users_genders = df_users[['first_name', 'last_name']]
df_users_genders['g_name'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users_genders['g_name']=None


In [8]:
df_users_genders['full_name'] = df_users_genders.apply(
    lambda x: f"{x['first_name']} {x['last_name']}", axis=1
)

df_users_genders

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users_genders['full_name'] = df_users_genders.apply(


Unnamed: 0,first_name,last_name,g_name,full_name
0,Арина,Анурьева,,Арина Анурьева
1,Xxx,Xxx,,Xxx Xxx
2,Красный,Луч,,Красный Луч
3,Wanny,Laemsan,,Wanny Laemsan
4,Channel-Myanmar,Movies,,Channel-Myanmar Movies
...,...,...,...,...
145,Костя,Вишняков,,Костя Вишняков
146,Саша,Москвичёва,,Саша Москвичёва
147,Омар,Магомедов,,Омар Магомедов
148,Константин,Савчук,,Константин Савчук


In [10]:
import torch
from transformers import pipeline
from tqdm import tqdm  # Для отображения прогресса
import pandas as pd

In [11]:
device = 0 if torch.cuda.is_available() else -1
print(f"Используется устройство: {'GPU' if device == 0 else 'CPU'}")

# Инициализация Zero-Shot Classification Pipeline с мультиязычной моделью
classifier = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
    device=0 if torch.cuda.is_available() else -1
)

Используется устройство: GPU


config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



In [14]:
def predict_gender_batch(names, candidate_labels=["Male", "Female", "Unknown"], batch_size=32):
    """
    Предсказывает гендер для списка имен с использованием Zero-Shot Classification.
    
    Параметры:
    - names: список строк, каждое имя может содержать имя и/или фамилию.
    - candidate_labels: список категорий для классификации.
    - batch_size: размер батча для обработки.
    
    Возвращает:
    - Список предсказанных гендеров.
    """
    predictions = []
    for i in tqdm(range(0, len(names), batch_size), desc="Обработка батчей"):
        batch = names[i:i + batch_size]
        results = classifier(batch, candidate_labels, multi_label=False)
        # Если размер батча 1, результат — словарь, иначе — список словарей
        if isinstance(results, dict):
            results = [results]
        for result in results:
            label = result['labels'][0]
            score = result['scores'][0]
            if score < 0.7:
                predictions.append("Unknown")
            else:
                predictions.append(label)
    return predictions

In [18]:
# data = [
#         {"first_name": "Александр", "last_name": "Иванов"},     # Русское имя
#         {"first_name": "Maria", "last_name": "Smith"},          # Английское имя
#         {"first_name": "Ольга", "last_name": "Петрова"},        # Русское имя
#         {"first_name": "李", "last_name": "王"},                  # Китайское имя
#         {"first_name": "Марія", "last_name": "Шевченко"},        # Украинское имя
#         {"first_name": "Анна", "last_name": "Невская"},          # Русское имя
#     ]

# Подготовка списка полных имен
names = []
for index, row in df_users_genders.iterrows():
    first_name = row['first_name']
    last_name = row['last_name']
    if first_name and last_name:
        full_name = f"{first_name} {last_name}"
    elif first_name:
        full_name = first_name
    else:
        full_name = last_name
    names.append(full_name)

# Предсказание гендеров
predicted_genders = predict_gender_batch(names, batch_size=32)

df_users_genders['predicted_gender'] = predicted_genders

df_users_genders.to_parquet('genderslmao.parquet')

Обработка батчей: 100%|██████████| 5/5 [00:07<00:00,  1.54s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users_genders['predicted_gender'] = predicted_genders


In [17]:
df_users_genders.head(150)

Unnamed: 0,first_name,last_name,g_name,full_name,predicted_gender
0,Арина,Анурьева,,Арина Анурьева,Female
1,Xxx,Xxx,,Xxx Xxx,Unknown
2,Красный,Луч,,Красный Луч,Unknown
3,Wanny,Laemsan,,Wanny Laemsan,Female
4,Channel-Myanmar,Movies,,Channel-Myanmar Movies,Unknown
...,...,...,...,...,...
145,Костя,Вишняков,,Костя Вишняков,Male
146,Саша,Москвичёва,,Саша Москвичёва,Female
147,Омар,Магомедов,,Омар Магомедов,Male
148,Константин,Савчук,,Константин Савчук,Male


In [108]:
df_users_genders['g_name'] = df_users_genders['predicted_gender'].apply(
    lambda x: 0 if x == 'Female' else (1 if x == 'Male' else 2))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users_genders['g_name'] = df_users_genders['predicted_gender'].apply(lambda x: 0 if x == 'Female' else (1 if x == 'Male' else 2))


In [109]:
df_users_genders[['full_name', 'predicted_gender']].to_parquet('predicted_genders.parquet')

# Создание фичи с ктегориями фотографий

In [116]:
from os import uname_result

df_photo = pd.read_parquet("/kaggle/input/rararara/photo.parquet")
print(df_photo.columns)

for i in df_photo.columns:
    print(f"============{i}")
    print(df_photo[i].describe())
    print(df_photo[i].isna().value_counts())

for j in range(10):
    print(df_photo['igm2txt'][j], df_photo['url'][j])

df_photo["url"].describe()

Index(['id', 'user_id', 'igm2txt', 'url', 'like_count'], dtype='object')
count    2145.000000
mean     1073.000000
std       619.352484
min         1.000000
25%       537.000000
50%      1073.000000
75%      1609.000000
max      2145.000000
Name: id, dtype: float64
id
False    2145
Name: count, dtype: int64
count    2145.000000
mean      426.772028
std        42.397426
min       357.000000
25%       392.000000
50%       428.000000
75%       461.000000
max       506.000000
Name: user_id, dtype: float64
user_id
False    2145
Name: count, dtype: int64
count                                        2145
unique                                       1766
top       a photography of a woman with long hair
freq                                           25
Name: igm2txt, dtype: object
igm2txt
False    2145
Name: count, dtype: int64
count                                                  2145
unique                                                 2135
top       https://sun9-28.userapi.com/s/v1/if2/f

count                                                  2145
unique                                                 2135
top       https://sun9-28.userapi.com/s/v1/if2/fq_nUsQL1...
freq                                                      4
Name: url, dtype: object

In [117]:
df_photo['igm2txt'] = df_photo['igm2txt'].str.slice(start=18)

In [118]:
def normalize_text(text):
    if isinstance(text, str):
        text = re.sub(r'<br>', ' ', text)
        #text = ''.join(char for char in text if char.isalnum() or char.isspace() or char in emoji.EMOJI_DATA.values())
        text = re.sub(r'\s+', ' ', text).strip()
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)

        # Замена нерусских символов пустыми строками
        #text = ''.join([char if ord(char) < 128 else '' for char in text])

        return text
    return ''


df_photo['igm2txt'].apply(normalize_text)

0       female character in a red dress surrounded by ...
1                                 vampire holding a knife
2       wo girls in a red chair with a butterfly on th...
3                   girl with long hair and a black dress
4                  girl in a black dress with a red caper
                              ...                        
2140                  man sitting on a bench in the woods
2141    woman taking a self self self self self self s...
2142                          woman with long blonde hair
2143    woman in a black jacket standing on a hill in ...
2144    woman standing on a bridge with a sunset in th...
Name: igm2txt, Length: 2145, dtype: object

In [119]:
categories = [
    "Nature photo", "Travel", "Music", "Sports", "Study", "Books", "Psychology", "Lifehacks", "Health", "Food",
    "Fitness", "Cooking", "Movies", "Theater", "Science", "History", "Technology", "Programming", "Design",
    "Beauty of appearance", "Cars", "Video Games", "Streams", "Comics", "Anime", "Violence", "Sports Teams",
    "Gardening", "News", "Animals",
    "Volunteering", "Politics", "Economics", "Finance", "Investments", "Cryptocurrency", "Advertising", "Scam", "Spam",
    "Drawing",
    "Sculpture", "Origami",
    "DIY (Do It Yourself)", "Funny Memes", "Sad vibe", "Sewing", "Dancing", "", "Fantasy",
    "Role-Playing Games", "Parties", "Pornography",
    "Literature Clubs", "Poetry", "Outdoor Activities",
    "Meditation",
    "Dangerous hobbies", "Martial Arts", "Musical instruments",
    "Relationship Psychology", "Self-Development", "Family", "Language learning",
    "Renovation", "Podcasts", "Scientific Discoveries", "Religion", "Mysticism",
    "Contests and Giveaways",
    "Adult Content", "Communities for Enthusiasts", "Teen Problems",
    "Interest Groups",
    "Computer Games", "Sketches", "Bloggers", "Flashmobs", "Mutual Help and Tips",
    "Trends and Challenges", "College and Student Life", "Dating", "Anonymous",
    "Podcasts", "Education", "Online Courses", "Editing"
]

categories = [i.lower() for i in categories]

In [26]:
%pip install langchain_chroma >> None
%pip install langchain_text_splitters >> None
%pip install langchain langchain-community chromadb sentence-transformers >> None

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [27]:
from langchain_text_splitters import CharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from chromadb.config import Settings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",  # Модель для эмбеддингов
    model_kwargs={"device": device})

from langchain_chroma import Chroma

documents = [Document(page_content=category) for category in categories]
db = Chroma.from_documents(documents, embeddings)

  embeddings = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [35]:
!wget -O GoogleNews-vectors-negative300.bin "https://figshare.com/ndownloader/files/10798046"


  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


--2024-10-20 08:10:42--  https://figshare.com/ndownloader/files/10798046
Resolving figshare.com (figshare.com)... 52.48.45.27, 54.77.229.210, 2a05:d018:1f4:d003:6607:cc51:928e:a3d3, ...
Connecting to figshare.com (figshare.com)|52.48.45.27|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/10798046/GoogleNewsvectorsnegative300.bin?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20241020/eu-west-1/s3/aws4_request&X-Amz-Date=20241020T081042Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=9a14a618c7388ee1fa26eb02965603674a0af98fe63c2a9789c7aa9e60701d78 [following]
--2024-10-20 08:10:42--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/10798046/GoogleNewsvectorsnegative300.bin?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20241020/eu-west-1/s3/aws4_request&X-Amz-Date=20241020T081042Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=9a14a

In [120]:
def find_category(query: str):
    results = db.similarity_search(query=query, k=3)
    results = [el.page_content for el in results]
    return results

In [121]:
df_photo['predicted_top_categories'] = None

for index, row in tqdm(df_photo.iterrows(), total=len(df_photo)):
    df_photo.at[index, 'predicted_top_categories'] = find_category(row['igm2txt'])

100%|██████████| 2145/2145 [00:24<00:00, 89.28it/s]


In [122]:
df_photo.head(20)

Unnamed: 0,id,user_id,igm2txt,url,like_count,predicted_top_categories
0,1,357,female character in a red dress surrounded by...,https://sun9-65.userapi.com/s/v1/if1/wwgdoaSH3...,104,"[anime, beauty of appearance, sculpture]"
1,2,357,vampire holding a knife,https://sun9-8.userapi.com/s/v1/if1/gssFXziHQw...,104,"[sewing, sculpture, mysticism]"
2,3,357,wo girls in a red chair with a butterfly on th...,https://sun9-24.userapi.com/s/v1/if1/Od_UayMZi...,107,"[teen problems, parties, beauty of appearance]"
3,4,357,girl with long hair and a black dress,https://sun9-68.userapi.com/s/v1/if1/0x65XShbd...,116,"[beauty of appearance, sewing, anime]"
4,5,357,girl in a black dress with a red caper,https://sun9-48.userapi.com/s/v1/if1/0ft6LUT4a...,133,"[beauty of appearance, anonymous, sewing]"
5,6,357,couple kissing in the dark,https://sun9-8.userapi.com/s/v1/if1/wDsSShNHYD...,145,"[relationship psychology, dating, fantasy]"
6,7,357,he anime girls from the anime manga series,https://sun9-77.userapi.com/s/v1/if1/J5xaD_a7o...,213,"[anime, comics, martial arts]"
7,8,357,he anime character from the anime series,https://sun9-31.userapi.com/s/v1/if1/7sFBJ_6yS...,130,"[anime, comics, origami]"
8,9,357,woman with long black hair and a red lipstick,https://sun9-3.userapi.com/s/v1/if1/NWZs5NVlsX...,124,"[beauty of appearance, dating, relationship ps..."
9,10,357,wo cute anime girls in white dresses with yell...,https://sun9-44.userapi.com/s/v1/if1/jEtTM0wBC...,110,"[anime, sketches, comics]"


In [123]:
df_photo.to_parquet('test_df_predicted_categories.parquet')