# Make a Word Cloud from Data Science terms

We will list terms with their weights, configure a word cloud, display it, and save it as an SVG file.

## Word Cloud parameters

[Documentation for the Wordcloud class can be found here.](https://amueller.github.io/word_cloud/generated/wordcloud.WordCloud.html#wordcloud.WordCloud)

In [None]:
# Keyword arguments handed to wordcloud.WordCloud when the cloud is built.
wordcloud_config = {
    # Caps on the number of words drawn and on the largest font size
    'max_words': 2000,
    'max_font_size': 100,

    # Canvas size in pixels (16:9)
    'width': 1600,
    'height': 900,

    # Word placement and font size granularity
    'prefer_horizontal': 0.5,
    'font_step': 1,

    # Per the WordCloud documentation, 'RGBA' mode together with a
    # background_color of None yields a transparent background
    'mode': 'RGBA',
    'background_color': None,

    # Fixed seed for the layout
    'random_state': 42,

    # 0 means the frequencies in the data are reflected less
    # accurately but it makes a better picture
    'relative_scaling': 0,
}

## Data Science terms for the word cloud
We break the terms into categories only because we are obsessed with data classification; the categories themselves do not affect the word cloud.

The weight controls the size of a term in the cloud; the maximum weight is 10. Terms with no weight (`None`) are assigned a random integer weight below 8 (1 to 7).

In [None]:
# Terms to draw, grouped by category. Each entry is a (weight, term) tuple:
# an explicit weight (at most 10) or None when the weight is to be filled in
# later. Every term must be unique regardless of letter case, otherwise it
# is drawn more than once in the cloud.
#
#       (weight,     term)
terms = dict(
    Algorithm = [
        (None,      'Decision Tree'),
        (None,      'Gradient Boosting'),
        (None,      'Hierarchical Clustering'),
        (None,      'Naive Bayes'),
        (None,      'Ensembles'),
        (None,      'Hyper-parameters'),
        (None,      'K-Means'),
        (None,      'Regression'),
        (None,      'ElasticNet'),
        (None,      'Classification'),
        (None,      'Logit'),
        (None,      'fit()'),
        (None,      'predict()'),
        (None,      'Optuna'),
        (10,        'Random Forest'),
        (None,      'Support Vector Machine'),
        (10,        'XGBoost'),
        (None,      'ARIMA'),
        (None,      'Neural Network'),
    ],
    Concept = [
        (None,      'Artificial Intelligence'),
        (None,      'Data Pipeline'),
        (10,        'Machine Learning'),
        (None,      'Predictive Analytics'),
        (None,      'Supervised learning'),
        (None,      'Unsupervised learning'),
        (None,      'Time series'),
        (10,        'Ŷ'),
    ],
    Format = [
        (None,      'CSV'),
        (None,      'JSON'),
        (10,        'Parquet'),
        (None,      'Pickle'),
    ],
    Library = [
        (None,      'BeautifulSoup'),
        (None,      'Matplotlib'),
        (None,      'NumPy'),
        (10,        'Pandas'),
        (10,        'Scikit Learn'),
        (None,      'SciPy'),
        (None,      'Seaborn'),
        (None,      'Tensor Flow'),
    ],
    Metric = [
        (None,      'Accuracy'),
        (None,      'Confusion Matrix'),
        (None,      'True positive'),
        (None,      'True negative'),
        (None,      'False positive'),
        (None,      'False negative'),
        (10,        'Correlation Matrix'),
        (None,      'Kolmogorov-Smirnov'),
        (9,         'F1 Score'),
        (None,      'Precision'),
        (None,      'Quantile'),
        (None,      'Recall'),
        (9,         'ROC Curve'),
        (None,      'Area under curve'),
        (None,      'Average'),
        (None,      'Median'),
        (None,      'Mean squared error'),
        (None,      'Mean squared logarithmic error'),
        (10,        'R²'),
        (None,      'Covariance'),
    ],
    Practice = [
        (None,      'Feature engineering'),
        (None,      'Feature selection'),
        (None,      'Hypothesis testing'),
        (None,      'Aggregation'),
        (None,      'Anonymization'),
        (None,      'Data cleansing'),
        (None,      'Data augmentation'),
        (None,      'Compression'),
        (None,      'Decryption'),
        (None,      'Encryption'),
        (None,      'Data Enrichment'),
        (None,      'Ethics'),
        (None,      'Normalization'),
        (None,      'Quality'),
        (None,      'Replication'),
        (9,         'Token'),
        (None,      'Data wrangling'),
        (None,      'Missing Data Imputation'),
        (None,      'Model Evaluation'),
        (None,      'Outlier'),
        (None,      'Train-Test Split'),
        (None,      'Stratified KFold'),
        (None,      'Web scraping'),
    ],
    Process = [
        (None,      'ETL'),
        (None,      'Batch processing'),
        (None,      'Cross validation'),
        (10,        'Drift detection'),
    ],
    Technique = [
        (None,      'Clustering'),
        (None,      'Principal Component Analysis'),
        (None,      'One-hot encoding'),
    ],
    Tool = [
        (None,      'Big Data'),
        (None,      'Data Lake'),
        (None,      'API'),
        (None,      'Data Catalog'),
        (10,        'Jupyter Notebook'),
        (None,      'Spark'),
        (9,         'SQL'),
        (None,      'Logarithm'),
        (None,      'BoxCox'),
    ],
    Visualization = [
        (None,      'Box plot'),
        (None,      'Histogram'),
        (9,         'Gaussian distribution'),
        (10,        'Normal distribution'),
        (None,      'Scatter plot'),
    ],
)

## Logic to make the word cloud

In [None]:
import matplotlib
import numpy
import pandas
import wordcloud

Fill the gaps and convert the data into something usable by the wordcloud library.

In [None]:
# One DataFrame per category, stacked into a single table with a fresh
# 0..n-1 index (the category names are not carried over)
frames = [
    pandas.DataFrame(rows, columns=['weight', 'term'])
    for rows in terms.values()
]
table = pandas.concat(frames, ignore_index=True)

# Store weights as pandas' nullable 8-bit integers
table = table.astype({'weight': pandas.Int8Dtype()})

# One random candidate weight per row; integers(1, 8) excludes the upper
# bound, so candidates range from 1 to 7
filler = pandas.Series(
    numpy.random.default_rng().integers(1, 8, size=len(table))
)

# Keep explicit weights; use the random candidate only where none was given
table = table.assign(weight=table['weight'].combine_first(filler))

# Convert to the shape used by the wordcloud step: {'weight': {term: weight}}
terms = table.set_index('term').to_dict()
# terms

Make the word cloud

In [None]:
# Build the cloud object from wordcloud_config, then lay out the terms
# using the {term: weight} mapping as frequencies
cloud = wordcloud.WordCloud(**wordcloud_config)
cloud.generate_from_frequencies(frequencies=terms['weight'])

Display it in the notebook

In [None]:
# `import matplotlib` alone does not load the pyplot submodule; import it
# explicitly rather than relying on another library having done so.
import matplotlib.pyplot

# 16x9 figure, same aspect ratio as the 1600x900 cloud
matplotlib.pyplot.figure(figsize=(16, 9))
matplotlib.pyplot.imshow(cloud, interpolation="bilinear")
# Hide the axes: only the picture is of interest
matplotlib.pyplot.axis(False)
matplotlib.pyplot.show()

Save as SVG file

In [None]:
# The cloud contains non-ASCII terms (e.g. 'Ŷ', 'R²'), so write the SVG as
# UTF-8 explicitly instead of depending on the platform's default encoding.
# Plain "w" is enough: the file is only written, never read back.
with open("Data Science Word Cloud.svg", "w", encoding="utf-8") as f:
    f.write(cloud.to_svg(embed_font=True))